Dan Magenheimer
2010-Mar-18 23:13 UTC
[Xen-devel] [PATCH] [post-4.0] tmem: add page deduplication (V2)
V1: Add "page deduplication" capability to Xen-side of tmem. (see below for log notes for V1 posting) V2: Add support for optional "trailing zero elimination" (tze) and compression when page deduplication is enabled. Tze saves significant tmem RAM when many data files are less than 1-3 pages and remaining (unused) space at the end of a page is zero-filled on disk or in memory, as may be the case for example when VMs are serving a large number of (deduplicate'able) web pages. Compression already was a tmem option; v2 of this patch combines compression with deduplication to further improve tmem RAM utilization. Either option (tmem_tze or tmem_compress) can be enabled at xen boot time in addition to deduplication (tmem_dedup) but compression overrides/disables tze. Both have a significant CPU cost so are useful primarily when memory is more constrained than CPU cycles, for example on a many-core machine with many low CPU-utilization RAM-needy VMs. tools/misc/xen-tmem-list-parse.c | 25 +- xen/common/tmem.c | 468 ++++++++++++++++++++++++++++++++------- xen/common/tmem_xen.c | 35 ++ xen/include/xen/tmem_xen.h | 122 +++++++++- 4 files changed, 566 insertions(+), 84 deletions(-) Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> ===== (This is for post-4.0, but I'm posting now for feedback.) Add "page deduplication" capability to Xen-side of tmem. (Transparent to tmem-enabled guests.) Ephemeral pages that have the exact same content are "combined" so that only one page frame is needed. Since ephemeral pages are essentially read-only, no C-O-W (and thus no equivalent of swapping) is necessary. Anybody know of any good fast (SSE2?) assembly memory compare routines? (See tmh_page_cmp() in the patch.) Points of interest: - Modifications to LRU eviction algorithm to accommodate dedup'ed pages - New data structures to allow lookup of matching pages and track references. (Algorithm used is similar to that used by KSM in KVM/Linux: No hashing required.) 
- Lock (and rbtree) chosen by first byte of data to allow reasonably high concurrency without greatly complicating lock management. - Statistics added so "dedup ratio" can be monitored. - Dedup is disabled/enabled by Xen command line option. I'm seeing 1.08-1.52 dedup ratio for two self-ballooned guests simultaneously/continuously building linux; that's up to a 34% reduction in physical ephemeral pages used by tmem. Clearly this is very workload-dependent. YMMV. To obtain this savings, approx double the time is spent in tmem (increasing from roughly 0.1% to roughly 0.2%). This compares favorably to compression which costs approximately 10x for an approximate savings of 50%. Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c --- a/tools/misc/xen-tmem-list-parse.c Wed Mar 03 17:41:58 2010 +0000 +++ b/tools/misc/xen-tmem-list-parse.c Thu Mar 18 14:40:50 2010 -0600 @@ -110,13 +110,34 @@ void parse_global(char *s) unsigned long long rtree_node_max = parse(s,"Nm"); unsigned long long pgp_count = parse(s,"Pc"); unsigned long long pgp_max = parse(s,"Pm"); + unsigned long long page_count = parse(s,"Fc"); + unsigned long long max_page_count = parse(s,"Fm"); + unsigned long long pcd_count = parse(s,"Sc"); + unsigned long long max_pcd_count = parse(s,"Sm"); + unsigned long long pcd_tot_tze_size = parse(s,"Zt"); + unsigned long long pcd_tot_csize = parse(s,"Gz"); printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n", total_ops, errored_ops, avail_pages); printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) " - "nodes=%llu (max=%llu)\n", + "nodes=%llu (max=%llu) pages=%llu (max=%llu) ", obj_count, obj_max, pgp_count, pgp_max, - rtree_node_count, rtree_node_max); + rtree_node_count, rtree_node_max, + page_count,max_page_count); + if (pcd_count != 0) + { + printf("pcds=%llu (max=%llu) ", + pcd_count,max_pcd_count); + printf("dedup savings=%4.2f%% ", + 
(1.0-(pcd_count*1.0)/global_eph_count)*100); + if (pcd_tot_tze_size && (pcd_tot_tze_size < pcd_count*PAGE_SIZE)) + printf("tze savings=%4.2f%% ", + (1.0-(pcd_tot_tze_size*1.0)/(pcd_count*PAGE_SIZE))*100); + if (pcd_tot_csize && (pcd_tot_csize < pcd_count*PAGE_SIZE)) + printf("compression savings=%4.2f%% ", + (1.0-(pcd_tot_csize*1.0)/(pcd_count*PAGE_SIZE))*100); + } + printf("\n"); printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu " "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, " "max_evicts_per_relinq=%llu, flush_pools=%llu, " diff -r b8d2a4134a68 xen/common/tmem.c --- a/xen/common/tmem.c Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/common/tmem.c Thu Mar 18 14:40:50 2010 -0600 @@ -6,11 +6,10 @@ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. */ -/* TODO list: 090129 - - improve on reclamation policy +/* TODO list: 090129 (updated 100318) + - any better reclamation policy? - use different tlsf pools for each client (maybe each pool) - - implement page accounting and minimal QoS limits - - test shared access more completely (need pv cluster fs) + - test shared access more completely (ocfs2) - add feedback-driven compression (not for persistent pools though!) 
- add data-structure total bytes overhead stats */ @@ -79,10 +78,13 @@ static unsigned long low_on_memory = 0; static unsigned long low_on_memory = 0; static int global_obj_count_max = 0; static int global_pgp_count_max = 0; +static int global_pcd_count_max = 0; static int global_page_count_max = 0; static int global_rtree_node_count_max = 0; static long global_eph_count_max = 0; static unsigned long failed_copies; +static unsigned long pcd_tot_tze_size = 0; +static unsigned long pcd_tot_csize = 0; DECL_CYC_COUNTER(succ_get); DECL_CYC_COUNTER(succ_put); @@ -108,6 +110,7 @@ DECL_CYC_COUNTER(decompress); struct tm_pool; struct tmem_page_descriptor; +struct tmem_page_content_descriptor; struct client { struct list_head client_list; struct tm_pool *pools[MAX_POOLS_PER_DOMAIN]; @@ -219,12 +222,17 @@ struct tmem_page_descriptor { obj_t *obj; uint64_t inv_oid; /* used for invalid list only */ }; + pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid, + else compressed data (cdata) */ uint32_t index; - size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid, - else compressed data (cdata) */ + /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */ + uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd */ + bool_t eviction_attempted; /* CHANGE TO lifetimes? 
(settable) */ + struct list_head pcd_siblings; union { pfp_t *pfp; /* page frame pointer */ char *cdata; /* compressed data */ + struct tmem_page_content_descriptor *pcd; /* page dedup */ }; union { uint64_t timestamp; @@ -233,6 +241,25 @@ struct tmem_page_descriptor { DECL_SENTINEL }; typedef struct tmem_page_descriptor pgp_t; + +#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64)) + +struct tmem_page_content_descriptor { + union { + pfp_t *pfp; /* page frame pointer */ + char *cdata; /* if compression_enabled */ + char *tze; /* if !compression_enabled, trailing zeroes eliminated */ + }; + struct list_head pgp_list; + struct rb_node pcd_rb_tree_node; + uint32_t pgp_ref_count; + pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata) + * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8 + * else PAGE_SIZE -> *pfp */ +}; +typedef struct tmem_page_content_descriptor pcd_t; +struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */ +rwlock_t pcd_tree_rwlocks[256]; /* poor man''s concurrency for now */ static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */ @@ -267,6 +294,7 @@ static long global_eph_count = 0; /* ato static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */ static atomic_t global_obj_count = ATOMIC_INIT(0); static atomic_t global_pgp_count = ATOMIC_INIT(0); +static atomic_t global_pcd_count = ATOMIC_INIT(0); static atomic_t global_page_count = ATOMIC_INIT(0); static atomic_t global_rtree_node_count = ATOMIC_INIT(0); @@ -336,6 +364,227 @@ static NOINLINE void tmem_page_free(pool atomic_dec_and_assert(global_page_count); } +/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/ + +#define NOT_SHAREABLE ((uint16_t)-1UL) + +static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp) +{ + uint8_t firstbyte = pgp->firstbyte; + pcd_t *pcd; + int ret; + + ASSERT(tmh_dedup_enabled()); + tmem_read_lock(&pcd_tree_rwlocks[firstbyte]); + pcd = 
pgp->pcd; + if ( pgp->size < PAGE_SIZE && pcd->size < PAGE_SIZE && pcd->size != 0 ) + ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL); + else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE ) + ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size); + else + ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL); + tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]); + return ret; +} + +/* ensure pgp no longer points to pcd, nor vice-versa */ +/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */ +static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock) +{ + pcd_t *pcd = pgp->pcd; + pfp_t *pfp = pgp->pcd->pfp; + uint16_t firstbyte = pgp->firstbyte; + char *pcd_tze = pgp->pcd->tze; + pagesize_t pcd_size = pcd->size; + pagesize_t pgp_size = pgp->size; + char *pcd_cdata = pgp->pcd->cdata; + pagesize_t pcd_csize = pgp->pcd->size; + + ASSERT(tmh_dedup_enabled()); + ASSERT(firstbyte != NOT_SHAREABLE); + ASSERT(firstbyte < 256); + + if ( have_pcd_rwlock ) + ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]); + else + tmem_write_lock(&pcd_tree_rwlocks[firstbyte]); + list_del_init(&pgp->pcd_siblings); + pgp->pcd = NULL; + pgp->firstbyte = NOT_SHAREABLE; + pgp->size = -1; + if ( --pcd->pgp_ref_count ) + { + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); + return; + } + + /* no more references to this pcd, recycle it and the physical page */ + ASSERT(list_empty(&pcd->pgp_list)); + pcd->pfp = NULL; + /* remove pcd from rbtree */ + rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]); + /* reinit the struct for safety for now */ + RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); + /* now free up the pcd memory */ + tmem_free(pcd,sizeof(pcd_t),NULL); + atomic_dec_and_assert(global_pcd_count); + if ( pgp_size != 0 && pcd_size < PAGE_SIZE ) + { + /* compressed data */ + tmem_free(pcd_cdata,pcd_csize,pool); + pcd_tot_csize -= pcd_csize; + } + else if ( pcd_size != PAGE_SIZE ) + { + /* trailing zero data */ + 
pcd_tot_tze_size -= pcd_size; + if ( pcd_size ) + tmem_free(pcd_tze,pcd_size,pool); + } else { + /* real physical page */ + if ( tmh_tze_enabled() ) + pcd_tot_tze_size -= PAGE_SIZE; + if ( tmh_compression_enabled() ) + pcd_tot_csize -= PAGE_SIZE; + tmem_page_free(pool,pfp); + } + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); +} + + +static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize) +{ + struct rb_node **new, *parent = NULL; + struct rb_root *root; + pcd_t *pcd; + int cmp; + pagesize_t pfp_size = 0; + uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) : *cdata; + int ret = 0; + + if ( !tmh_dedup_enabled() ) + return 0; + ASSERT(pgp->obj != NULL); + ASSERT(pgp->obj->pool != NULL); + ASSERT(!pgp->obj->pool->persistent); + if ( cdata == NULL ) + { + ASSERT(pgp->pfp != NULL); + pfp_size = PAGE_SIZE; + if ( tmh_tze_enabled() ) + { + pfp_size = tmh_tze_pfp_scan(pgp->pfp); + if ( pfp_size > PCD_TZE_MAX_SIZE ) + pfp_size = PAGE_SIZE; + } + ASSERT(pfp_size <= PAGE_SIZE); + ASSERT(!(pfp_size & (sizeof(uint64_t)-1))); + } + tmem_write_lock(&pcd_tree_rwlocks[firstbyte]); + + /* look for page match */ + root = &pcd_tree_roots[firstbyte]; + new = &(root->rb_node); + while ( *new ) + { + pcd = container_of(*new, pcd_t, pcd_rb_tree_node); + parent = *new; + /* compare new entry and rbtree entry, set cmp accordingly */ + if ( cdata != NULL ) + { + if ( pcd->size < PAGE_SIZE ) + /* both new entry and rbtree entry are compressed */ + cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size); + else + /* new entry is compressed, rbtree entry is not */ + cmp = -1; + } else if ( pcd->size < PAGE_SIZE ) + /* rbtree entry is compressed, new entry is not */ + cmp = 1; + else if ( tmh_tze_enabled() ) { + if ( pcd->size < PAGE_SIZE ) + /* both new entry and rbtree entry are trailing zero */ + cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size); + else + /* new entry is trailing zero, rbtree entry is not */ + cmp = 
tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE); + } else { + /* both new entry and rbtree entry are full physical pages */ + ASSERT(pgp->pfp != NULL); + ASSERT(pcd->pfp != NULL); + cmp = tmh_page_cmp(pgp->pfp,pcd->pfp); + } + + /* walk tree or match depending on cmp */ + if ( cmp < 0 ) + new = &((*new)->rb_left); + else if ( cmp > 0 ) + new = &((*new)->rb_right); + else + { + /* match! if not compressed, free the no-longer-needed page */ + /* but if compressed, data is assumed static so don''t free! */ + if ( cdata == NULL ) + tmem_page_free(pgp->obj->pool,pgp->pfp); + goto match; + } + } + + /* exited while loop with no match, so alloc a pcd and put it in the tree */ + if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL ) + { + ret = -ENOMEM; + goto unlock; + } else if ( cdata != NULL ) { + if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->obj->pool)) == NULL ) + { + tmem_free(pcd,sizeof(pcd_t),NULL); + ret = -ENOMEM; + goto unlock; + } + } + atomic_inc_and_max(global_pcd_count); + RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */ + INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */ + pcd->pgp_ref_count = 0; + if ( cdata != NULL ) + { + memcpy(pcd->cdata,cdata,csize); + pcd->size = csize; + pcd_tot_csize += csize; + } else if ( pfp_size == 0 ) { + ASSERT(tmh_tze_enabled()); + pcd->size = 0; + pcd->tze = NULL; + } else if ( pfp_size < PAGE_SIZE && + ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->obj->pool)) != NULL) ) { + tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size); + pcd->size = pfp_size; + pcd_tot_tze_size += pfp_size; + tmem_page_free(pgp->obj->pool,pgp->pfp); + } else { + pcd->pfp = pgp->pfp; + pcd->size = PAGE_SIZE; + if ( tmh_tze_enabled() ) + pcd_tot_tze_size += PAGE_SIZE; + if ( tmh_compression_enabled() ) + pcd_tot_csize += PAGE_SIZE; + } + rb_link_node(&pcd->pcd_rb_tree_node, parent, new); + rb_insert_color(&pcd->pcd_rb_tree_node, root); + +match: + pcd->pgp_ref_count++; + list_add(&pgp->pcd_siblings,&pcd->pgp_list); + pgp->firstbyte 
= firstbyte; + pgp->eviction_attempted = 0; + pgp->pcd = pcd; + +unlock: + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); + return ret; +} + /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/ /* allocate a pgp_t and associate it with an object */ @@ -353,6 +602,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t * INIT_LIST_HEAD(&pgp->global_eph_pages); INIT_LIST_HEAD(&pgp->client_eph_pages); pgp->pfp = NULL; + if ( tmh_dedup_enabled() ) + { + pgp->firstbyte = NOT_SHAREABLE; + pgp->eviction_attempted = 0; + INIT_LIST_HEAD(&pgp->pcd_siblings); + } pgp->size = -1; pgp->index = -1; pgp->timestamp = get_cycles(); @@ -374,18 +629,20 @@ static pgp_t *pgp_lookup_in_obj(obj_t *o static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool) { + pagesize_t pgp_size = pgp->size; + if ( pgp->pfp == NULL ) return; - if ( !pgp->size ) + if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE ) + pcd_disassociate(pgp,pool,0); /* pgp->size lost */ + else if ( pgp_size ) + tmem_free(pgp->cdata,pgp_size,pool); + else tmem_page_free(pgp->obj->pool,pgp->pfp); - else + if ( pool != NULL && pgp_size ) { - tmem_free(pgp->cdata,pgp->size,pool); - if ( pool != NULL ) - { - pool->client->compressed_pages--; - pool->client->compressed_sum_size -= pgp->size; - } + pool->client->compressed_pages--; + pool->client->compressed_sum_size -= pgp_size; } pgp->pfp = NULL; pgp->size = -1; @@ -987,10 +1244,56 @@ static void client_freeze(client_t *clie /************ MEMORY REVOCATION ROUTINES *******************************/ +static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock) +{ + obj_t *obj = pgp->obj; + pool_t *pool = obj->pool; + client_t *client = pool->client; + uint16_t firstbyte = pgp->firstbyte; + + if ( pool->is_dying ) + return 0; + if ( tmh_lock_all && !obj->no_evict ) + return 1; + if ( tmem_spin_trylock(&obj->obj_spinlock) ) + { + if ( tmh_dedup_enabled() ) + { + firstbyte = pgp->firstbyte; + if ( firstbyte == NOT_SHAREABLE ) + goto obj_unlock; + 
ASSERT(firstbyte < 256); + if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) ) + goto obj_unlock; + if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted ) + { + pgp->eviction_attempted++; + list_del(&pgp->global_eph_pages); + list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list); + list_del(&pgp->client_eph_pages); + list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list); + goto pcd_unlock; + } + } + if ( obj->pgp_count > 1 ) + return 1; + if ( tmem_write_trylock(&pool->pool_rwlock) ) + { + *hold_pool_rwlock = 1; + return 1; + } +pcd_unlock: + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); +obj_unlock: + tmem_spin_unlock(&obj->obj_spinlock); + } + return 0; +} + static int tmem_evict(void) { client_t *client = tmh_client_from_current(); - pgp_t *pgp = NULL, *pgp_del; + pgp_t *pgp = NULL, *pgp2, *pgp_del; obj_t *obj; pool_t *pool; int ret = 0; @@ -1001,49 +1304,15 @@ static int tmem_evict(void) if ( (client != NULL) && client_over_quota(client) && !list_empty(&client->ephemeral_page_list) ) { - list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages) - { - obj = pgp->obj; - pool = obj->pool; - if ( pool->is_dying ) - continue; - if ( tmh_lock_all && !obj->no_evict ) + list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages) + if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; - if ( tmem_spin_trylock(&obj->obj_spinlock) ) - { - if ( obj->pgp_count > 1 ) - goto found; - if ( tmem_write_trylock(&pool->pool_rwlock) ) - { - hold_pool_rwlock = 1; - goto found; - } - tmem_spin_unlock(&obj->obj_spinlock); - } - } } else if ( list_empty(&global_ephemeral_page_list) ) { goto out; } else { - list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages) - { - obj = pgp->obj; - pool = obj->pool; - if ( pool->is_dying ) - continue; - if ( tmh_lock_all && !obj->no_evict ) + list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages) + if ( 
tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; - if ( tmem_spin_trylock(&obj->obj_spinlock) ) - { - if ( obj->pgp_count > 1 ) - goto found; - if ( tmem_write_trylock(&pool->pool_rwlock) ) - { - hold_pool_rwlock = 1; - goto found; - } - tmem_spin_unlock(&obj->obj_spinlock); - } - } } ret = 0; @@ -1057,10 +1326,16 @@ found: ASSERT(obj->no_evict == 0); ASSERT(obj->pool != NULL); ASSERT_SENTINEL(obj,OBJ); + pool = obj->pool; ASSERT_SPINLOCK(&obj->obj_spinlock); pgp_del = pgp_delete_from_obj(obj, pgp->index); ASSERT(pgp_del == pgp); + if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE ) + { + ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted); + pcd_disassociate(pgp,pool,1); + } pgp_delete(pgp,1); if ( obj->pgp_count == 0 ) { @@ -1129,25 +1404,30 @@ static NOINLINE int do_tmem_put_compress #ifdef __i386__ return -ENOMEM; #endif + if ( pgp->pfp != NULL ) - pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */ + pgp_free_data(pgp, pgp->obj->pool); START_CYC_COUNTER(compress); ret = tmh_compress_from_client(cmfn, &dst, &size, cva); if ( (ret == -EFAULT) || (ret == 0) ) goto out; - else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) + else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) { ret = 0; - else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) + goto out; + } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) { + if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM ) + goto out; + } else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) { ret = -ENOMEM; - else - { + goto out; + } else { memcpy(p,dst,size); pgp->cdata = p; - pgp->size = size; - pgp->obj->pool->client->compressed_pages++; - pgp->obj->pool->client->compressed_sum_size += size; - ret = 1; } + pgp->size = size; + pgp->obj->pool->client->compressed_pages++; + pgp->obj->pool->client->compressed_sum_size += size; + ret = 1; out: END_CYC_COUNTER(compress); @@ -1155,7 +1435,7 @@ out: } static NOINLINE int 
do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn, - uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva) { pool_t *pool; obj_t *obj; @@ -1197,6 +1477,11 @@ copy_uncompressed: ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0); if ( ret == -EFAULT ) goto bad_copy; + if ( tmh_dedup_enabled() && !is_persistent(pool) ) + { + if ( pcd_associate(pgp,NULL,0) == -ENOMEM ) + goto failed_dup; + } pgp->size = 0; done: @@ -1239,8 +1524,8 @@ failed_dup: static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva) { obj_t *obj = NULL, *objfound = NULL, *objnew = NULL; pgp_t *pgp = NULL, *pgpdel = NULL; @@ -1308,13 +1593,18 @@ copy_uncompressed: copy_uncompressed: if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL ) { - ret == -ENOMEM; + ret = -ENOMEM; goto delete_and_free; } /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */ ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva); if ( ret == -EFAULT ) goto bad_copy; + if ( tmh_dedup_enabled() && !is_persistent(pool) ) + { + if ( pcd_associate(pgp,NULL,0) == -ENOMEM ) + goto delete_and_free; + } pgp->size = 0; insert_page: @@ -1376,8 +1666,8 @@ ASSERT(0); } static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva) { obj_t *obj; pgp_t *pgp; @@ -1404,15 +1694,18 @@ static NOINLINE int do_tmem_get(pool_t * return 0; } ASSERT(pgp->size != -1); - if ( pgp->size != 0 ) + if ( tmh_dedup_enabled() && !is_persistent(pool) && + pgp->firstbyte != NOT_SHAREABLE ) { + if ( 
pcd_copy_to_client(cmfn, pgp) == -EFAULT ) + goto bad_copy; + } else if ( pgp->size != 0 ) { START_CYC_COUNTER(decompress); if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size, cva) == -EFAULT ) goto bad_copy; END_CYC_COUNTER(decompress); - } - else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, + } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, pfn_offset, len, cva) == -EFAULT) goto bad_copy; if ( is_ephemeral(pool) ) @@ -1855,11 +2148,15 @@ static int tmemc_list_global(tmem_cli_va total_flush_pool, use_long ? ',' : '\n'); if (use_long) n += scnprintf(info+n,BSIZE-n, - "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n", + "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d," + "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Zt:%lu,Gz:%lu\n", global_eph_count, global_eph_count_max, _atomic_read(global_obj_count), global_obj_count_max, _atomic_read(global_rtree_node_count), global_rtree_node_count_max, - _atomic_read(global_pgp_count), global_pgp_count_max); + _atomic_read(global_pgp_count), global_pgp_count_max, + _atomic_read(global_page_count), global_page_count_max, + _atomic_read(global_pcd_count), global_pcd_count_max, + pcd_tot_tze_size,pcd_tot_csize); if ( sum + n >= len ) return sum; tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); @@ -1912,6 +2209,13 @@ static int tmemc_set_var_one(client_t *c #ifdef __i386__ return -1; #endif + if ( tmh_dedup_enabled() ) + { + printk("tmem: compression %s for all %ss, cannot be changed " + "when tmem_dedup is enabled\n", + tmh_compression_enabled() ? "enabled" : "disabled",client_str); + return -1; + } client->compress = arg1 ? 1 : 0; printk("tmem: compression %s for %s=%d\n", arg1 ? 
"enabled" : "disabled",cli_id_str,cli_id); @@ -2569,14 +2873,28 @@ EXPORT void *tmem_relinquish_pages(unsig /* called at hypervisor startup */ EXPORT void init_tmem(void) { + int i; if ( !tmh_enabled() ) return; radix_tree_init(); + if ( tmh_dedup_enabled() ) + for (i = 0; i < 256; i++ ) + { + pcd_tree_roots[i] = RB_ROOT; + rwlock_init(&pcd_tree_rwlocks[i]); + } + if ( tmh_init() ) { - printk("tmem: initialized comp=%d global-lock=%d\n", - tmh_compression_enabled(), tmh_lock_all); + printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n", + tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enabled(), + tmh_lock_all); + if ( tmh_dedup_enabled()&&tmh_compression_enabled()&&tmh_tze_enabled() ) + { + tmh_tze_disable(); + printk("tmem: tze and compression not compatible, disabling tze\n"); + } tmem_initialized = 1; } else diff -r b8d2a4134a68 xen/common/tmem_xen.c --- a/xen/common/tmem_xen.c Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/common/tmem_xen.c Thu Mar 18 14:40:50 2010 -0600 @@ -17,8 +17,14 @@ EXPORT int opt_tmem = 0; EXPORT int opt_tmem = 0; boolean_param("tmem", opt_tmem); -EXPORT int opt_tmem_compress = 0; +EXPORT int opt_tmem_compress = 1; boolean_param("tmem_compress", opt_tmem_compress); + +EXPORT int opt_tmem_dedup = 1; +boolean_param("tmem_dedup", opt_tmem_dedup); + +EXPORT int opt_tmem_tze = 0; +boolean_param("tmem_tze", opt_tmem_tze); EXPORT int opt_tmem_shared_auth = 0; boolean_param("tmem_shared_auth", opt_tmem_shared_auth); @@ -103,8 +109,8 @@ static inline void *cli_mfn_to_va(tmem_c #endif EXPORT int tmh_copy_from_client(pfp_t *pfp, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cli_va) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cli_va) { unsigned long tmem_mfn; void *tmem_va; @@ -148,7 +154,7 @@ EXPORT int tmh_compress_from_client(tmem } EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, - uint32_t tmem_offset, uint32_t 
pfn_offset, uint32_t len, void *cli_va) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cli_va) { unsigned long tmem_mfn, cli_mfn = 0; int mark_dirty = 1; @@ -195,6 +201,27 @@ EXPORT int tmh_decompress_to_client(tmem unmap_domain_page(cli_va); paging_mark_dirty(current->domain,cli_mfn); } + mb(); + return 1; +} + +EXPORT int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, + pagesize_t len) +{ + void *cli_va; + unsigned long cli_mfn; + + ASSERT(!(len & (sizeof(uint64_t)-1))); + ASSERT(len <= PAGE_SIZE); + ASSERT(len > 0 || tmem_va == NULL); + if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL) + return -EFAULT; + if ( len > 0 ) + memcpy((char *)cli_va,(char *)tmem_va,len); + if ( len < PAGE_SIZE ) + memset((char *)cli_va+len,0,PAGE_SIZE-len); + unmap_domain_page(cli_va); + paging_mark_dirty(current->domain,cli_mfn); mb(); return 1; } diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h --- a/xen/include/xen/tmem_xen.h Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/include/xen/tmem_xen.h Thu Mar 18 14:40:50 2010 -0600 @@ -26,6 +26,8 @@ struct tmem_host_dependent_client { }; typedef struct tmem_host_dependent_client tmh_client_t; +typedef uint32_t pagesize_t; /* like size_t, must handle largest PAGE_SIZE */ + #define IS_PAGE_ALIGNED(addr) \ ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr) #define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) ) @@ -52,6 +54,23 @@ static inline int tmh_compression_enable static inline int tmh_compression_enabled(void) { return opt_tmem_compress; +} + +extern int opt_tmem_dedup; +static inline int tmh_dedup_enabled(void) +{ + return opt_tmem_dedup; +} + +extern int opt_tmem_tze; +static inline int tmh_tze_enabled(void) +{ + return opt_tmem_tze; +} + +static inline void tmh_tze_disable(void) +{ + opt_tmem_tze = 0; } extern int opt_tmem_shared_auth; @@ -326,6 +345,101 @@ static inline bool_t tmh_current_is_priv return IS_PRIV(current->domain); } +static inline uint8_t 
tmh_get_first_byte(pfp_t *pfp) +{ + void *p = __map_domain_page(pfp); + + return (uint8_t)(*(char *)p); +} + +static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2) +{ + const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1); + const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp2); + int i; + + // FIXME: code in assembly? +ASSERT(p1 != NULL); +ASSERT(p2 != NULL); + for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i--, *p1++, *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +static inline int tmh_pcd_cmp(void *va1, pagesize_t len1, void *va2, pagesize_t len2) +{ + const char *p1 = (char *)va1; + const char *p2 = (char *)va2; + pagesize_t i; + + ASSERT(len1 <= PAGE_SIZE); + ASSERT(len2 <= PAGE_SIZE); + if ( len1 < len2 ) + return -1; + if ( len1 > len2 ) + return 1; + ASSERT(len1 == len2); + for ( i = len2; i && *p1 == *p2; i--, *p1++, *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len, void *tva, pagesize_t tze_len) +{ + const uint64_t *p1 = (uint64_t *)__map_domain_page(pfp1); + const uint64_t *p2; + pagesize_t i; + + if ( tze_len == PAGE_SIZE ) + p2 = (uint64_t *)__map_domain_page((pfp_t *)tva); + else + p2 = (uint64_t *)tva; + ASSERT(pfp_len <= PAGE_SIZE); + ASSERT(!(pfp_len & (sizeof(uint64_t)-1))); + ASSERT(tze_len <= PAGE_SIZE); + ASSERT(!(tze_len & (sizeof(uint64_t)-1))); + if ( pfp_len < tze_len ) + return -1; + if ( pfp_len > tze_len ) + return 1; + ASSERT(pfp_len == tze_len); + for ( i = tze_len/sizeof(uint64_t); i && *p1 == *p2; i--, *p1++, *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +/* return the size of the data in the pfp, ignoring trailing zeroes and + * rounded up to the nearest multiple of 8 */ +static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp) +{ + const uint64_t *p = (uint64_t *)__map_domain_page(pfp); + pagesize_t bytecount = PAGE_SIZE; + pagesize_t len = 
PAGE_SIZE/sizeof(uint64_t); + p += len; + while ( len-- && !*--p ) + bytecount -= sizeof(uint64_t); + return bytecount; +} + +static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t len) +{ + uint64_t *p1 = (uint64_t *)tva; + const uint64_t *p2 = (uint64_t *)__map_domain_page(pfp); + + pagesize_t i; + ASSERT(!(len & (sizeof(uint64_t)-1))); + for ( i = len/sizeof(uint64_t); i--; *p1++ = *p2++); +} + /* these typedefs are in the public/tmem.h interface typedef XEN_GUEST_HANDLE(void) cli_mfn_t; typedef XEN_GUEST_HANDLE(char) cli_va_t; @@ -378,11 +492,13 @@ extern int tmh_compress_from_client(tmem extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*); extern int tmh_copy_from_client(pfp_t *pfp, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva); + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva); extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, - uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva); + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva); + +extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, pagesize_t len); #define TMEM_PERF _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel