Committed.
CVS mail might not have been sent.
On Sat, Mar 29, 2003 at 04:56:30PM -0800, jw schultz
wrote:> Mark II of the patch set.
>
> The first patch (dynsumlen2.patch) increments the protocol
> version to support per-file dynamic block checksum sizes.
> It is a prerequisite for varsumlen2.patch.
>
> varsumlen2.patch implements per-file dynamic block and checksum
> sizes.
>
> The current block size calculation only applies to files
> between 7MB and 160MB setting the block size to 1/10,0000 of
> the file length for a fixed count of 10,001 blocks.
> Starting at 160MB the block size would stay at 16KB and the
> count would be 1/15,536 of the file length. A logic error
> prevents --block-size=700 from locking the block size.
>
> While an improvement over fixed 700 byte blocks for very
> large files the block count would grow linearly with
> file size to outrageous numbers (65536/GB). An iso image
> would need over 40,000 blocks and with the fixed sum size of
> 4+2 bytes had a measurable sum collision rate. With the sum
> collisions the file would have to be redone with 4+16 byte
> sums killing efficiency. Additionally all those blocks
> meant a good deal of processing to find matching blocks and
> a sizable memory and network load.
>
> varsumlen2.patch calculates the block size to be a rounded square
> root of the file size with a 700byte minimum. In this way
> block sizes and counts grow smoothly with file size for
> files larger than 478KB. A 1MB file will have 1024 1KB
> blocks and a 4GB file would have 65,536 64KB blocks.
> --block-size will override this regardless of what block
> size is specified.
>
> The variable checksum size takes block size into account
> using an approximation of the formula provided by Donovan
> Baarda on this list (blocksum_bits includes the 32bit sum1).
>
> blocksum_bits = BIAS + 2*log2(file_len) - log2(block_len)
>
> This should result in increased efficiency as files grow,
> reduce the likelihood of file-redo and raise the ceiling on
> how large a file can be processed by rsync. The one case
> where efficiency could be decreased would be huge files with
> many tiny changes scattered throughout.
>
> The first table illustrates this showing pertinent data for
> a sampling of file sizes. xmit sums shows the relative
> network load of just sending checksums. array_size is the
> memory footprint sums array which could compounded by the
> sum hash could be a limiting factor on the size file a given
> machine can reasonably handle.
>
> The second table shows the effect of --block-size=16384
> and is indicative of what setting a MAX_BLOCK_SIZE would do.
>
>
> file length block_len block_count s2length xmit sums array_size
> 50 700 1 2 6 36
> 831K 920 926 2 5556 32K
> 1439K 1208 1221 2 7326 42K
> 2493K 1592 1604 2 9624 56K
> 4317K 2096 2110 2 12K 74K
> 7475K 2760 2774 2 16K 97K
> 12M 3640 3642 2 21K 128K
> 21M 4784 4799 2 28K 168K
> 32M 5824 5835 3 39K 205K
> 56M 7664 7678 3 52K 269K
> 97M 9K 9K 3 69K 355K
> 168M 12K 12K 3 90K 467K
> 291M 17K 17K 3 119K 614K
> 504M 22K 22K 3 157K 808K
> 873M 29K 29K 3 206K 1064K
> 1513M 38K 38K 3 272K 1400K
> 2070M 45K 45K 4 366K 1647K
> 4195M 119K 35K 4 280K 1261K
> 16G 251K 65K 4 525K 2365K
> 66G 509K 133K 5 1198K 4795K
> 261G 1022K 262K 5 2358K 9434K
> 1033G 2047K 516K 5 4650K 18M
> 2092G 2047K 1046K 6 10M 36M
> 4239G 4095K 1060K 6 10M 37M
> 16T 8191K 2091K 6 20M 73M
> 64T 15M 4126K 6 40M 145M
> 130T 15M 8359K 7 89M 293M
>
> file length block_len block_count s2length xmit sums array_size
> 50 16K 1 2 6 36
> 65M 16K 4202 3 28K 147K
> 1063M 16K 66K 4 531K 2392K
> 16G 16K 1034K 5 9312K 36M
> 261G 16K 16M 6 163M 589M
> 4239G 16K 264M 7 2914M 9539M
> 64T 16K 30M 8 365M 1096M
>
> --
> ________________________________________________________________
> J.W. Schultz Pegasystems Technologies
> email address: jw@pegasys.ws
>
> Remember Cernan and Schmitt
> --- rsync.h Sat Mar 29 11:11:30 2003
> +++ rsync.h Sat Mar 29 12:15:15 2003
> @@ -50,7 +50,7 @@
> #define SAME_TIME (1<<7)
>
> /* update this if you make incompatible changes */
> -#define PROTOCOL_VERSION 26
> +#define PROTOCOL_VERSION 27
>
> /* We refuse to interoperate with versions that are not in this range.
> * Note that we assume we'll work with later versions: the onus is on
> @@ -406,7 +406,8 @@
> OFF_T flength; /**< total file length */
> size_t count; /**< how many chunks */
> size_t remainder; /**< flength % block_length */
> - size_t n; /**< block_length */
> + size_t blength; /**< block_length */
> + size_t s2length; /**< sum2_length */
> struct sum_buf *sums; /**< points to info for each chunk */
> };
>
> --- proto.h Sat Mar 29 12:18:02 2003
> +++ proto.h Sat Mar 29 12:15:38 2003
> @@ -91,6 +91,7 @@
> struct file_list *flist_new(void);
> void flist_free(struct file_list *flist);
> char *f_name(struct file_struct *f);
> +void write_sum_head(int f, struct sum_struct *sum);
> void recv_generator(char *fname, struct file_list *flist, int i, int
f_out);
> void generate_files(int f,struct file_list *flist,char *local_name,int
f_recv);
> int main(int argc, char *argv[]);
> @@ -190,6 +191,7 @@
> int report);
> void sig_int(void);
> void finish_transfer(char *fname, char *fnametmp, struct file_struct
*file);
> +void read_sum_head(int f, struct sum_struct *sum);
> void send_files(struct file_list *flist,int f_out,int f_in);
> int try_bind_local(int s,
> int ai_family, int ai_socktype,
> --- generator.c Sat Mar 29 11:11:30 2003
> +++ generator.c Sat Mar 29 12:16:02 2003
> @@ -116,13 +116,21 @@
>
>
> /*
> - send a header that says "we have no checksums" down the f_out
fd
> + * NULL sum_struct means we have no checksums
> */
> -static void send_null_sums(int f_out)
> +
> +void write_sum_head(int f, struct sum_struct *sum)
> {
> - write_int(f_out, 0);
> - write_int(f_out, block_size);
> - write_int(f_out, 0);
> + static struct sum_struct null_sum;
> +
> + if (sum == (struct sum_struct *)NULL)
> + sum = &null_sum;
> +
> + write_int(f, sum->count);
> + write_int(f, sum->blength);
> + if (remote_version >= 27)
> + write_int(f, sum->s2length);
> + write_int(f, sum->remainder);
> }
>
>
> @@ -164,19 +172,18 @@
>
> sum.count = (len + (block_len - 1)) / block_len;
> sum.remainder = (len % block_len);
> - sum.n = block_len;
> + sum.blength = block_len;
> sum.flength = len;
> + sum.s2length = csum_length;
> /* not needed here sum.sums = NULL; */
>
> if (sum.count && verbose > 3) {
> rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
> (long) sum.count, (long) sum.remainder,
> - (long) sum.n, (double) sum.flength);
> + (long) sum.blength, (double) sum.flength);
> }
>
> - write_int(f_out, sum.count);
> - write_int(f_out, sum.n);
> - write_int(f_out, sum.remainder);
> + write_sum_head(f_out, &sum);
>
> for (i = 0; i < sum.count; i++) {
> int n1 = MIN(len, block_len);
> @@ -192,7 +199,7 @@
> i, (double) offset, n1, (unsigned long) sum1);
> }
> write_int(f_out, sum1);
> - write_buf(f_out, sum2, csum_length);
> + write_buf(f_out, sum2, sum.s2length);
> len -= n1;
> offset += n1;
> }
> @@ -384,7 +391,7 @@
> if (statret == -1) {
> if (errno == ENOENT) {
> write_int(f_out,i);
> - if (!dry_run) send_null_sums(f_out);
> + if (!dry_run) write_sum_head(f_out, NULL);
> } else {
> if (verbose > 1)
> rprintf(FERROR, RSYNC_NAME
> @@ -401,7 +408,7 @@
>
> /* now pretend the file didn't exist */
> write_int(f_out,i);
> - if (!dry_run) send_null_sums(f_out);
> + if (!dry_run) write_sum_head(f_out, NULL);
> return;
> }
>
> @@ -430,7 +437,7 @@
>
> if (disable_deltas_p()) {
> write_int(f_out,i);
> - send_null_sums(f_out);
> + write_sum_head(f_out, NULL);
> return;
> }
>
> @@ -441,7 +448,7 @@
> rprintf(FERROR,RSYNC_NAME": failed to open \"%s\",
continuing : %s\n",fnamecmp,strerror(errno));
> /* pretend the file didn't exist */
> write_int(f_out,i);
> - send_null_sums(f_out);
> + write_sum_head(f_out, NULL);
> return;
> }
>
> --- receiver.c Sat Mar 29 11:11:30 2003
> +++ receiver.c Sat Mar 29 12:16:23 2003
> @@ -230,7 +230,8 @@
> OFF_T total_size)
> {
> int i;
> - unsigned int n,remainder,len,count;
> + struct sum_struct sum;
> + unsigned int len;
> OFF_T offset = 0;
> OFF_T offset2;
> char *data;
> @@ -238,9 +239,7 @@
> static char file_sum2[MD4_SUM_LENGTH];
> char *map=NULL;
>
> - count = read_int(f_in);
> - n = read_int(f_in);
> - remainder = read_int(f_in);
> + read_sum_head(f_in, &sum);
>
> sum_init();
>
> @@ -270,10 +269,10 @@
> }
>
> i = -(i+1);
> - offset2 = i*(OFF_T)n;
> - len = n;
> - if (i == (int) count-1 && remainder != 0)
> - len = remainder;
> + offset2 = i*(OFF_T)sum.blength;
> + len = sum.blength;
> + if (i == (int) sum.count-1 && sum.remainder != 0)
> + len = sum.remainder;
>
> stats.matched_data += len;
>
> --- sender.c Sat Mar 29 11:11:30 2003
> +++ sender.c Sat Mar 29 12:16:32 2003
> @@ -37,6 +37,19 @@
> **/
>
>
> +void read_sum_head(int f, struct sum_struct *sum)
> +{
> + sum->count = read_int(f);
> + sum->blength = read_int(f);
> + if (remote_version < 27)
> + {
> + sum->s2length = csum_length;
> + } else {
> + sum->s2length = read_int(f);
> + }
> + sum->remainder = read_int(f);
> +}
> +
> /**
> * Receive the checksums for a buffer
> **/
> @@ -49,14 +62,14 @@
> s = (struct sum_struct *)malloc(sizeof(*s));
> if (!s) out_of_memory("receive_sums");
>
> - s->count = read_int(f);
> - s->n = read_int(f);
> - s->remainder = read_int(f);
> + read_sum_head(f, s);
> +
> s->sums = NULL;
>
> if (verbose > 3)
> rprintf(FINFO,"count=%ld n=%ld rem=%ld\n",
> - (long) s->count, (long) s->n, (long) s->remainder);
> + (long) s->count, (long) s->blength,
> + (long) s->remainder);
>
> if (s->count == 0)
> return(s);
> @@ -66,7 +79,7 @@
>
> for (i=0; i < (int) s->count;i++) {
> s->sums[i].sum1 = read_int(f);
> - read_buf(f,s->sums[i].sum2,csum_length);
> + read_buf(f,s->sums[i].sum2,s->s2length);
>
> s->sums[i].offset = offset;
> s->sums[i].i = i;
> @@ -74,7 +87,7 @@
> if (i == (int) s->count-1 && s->remainder != 0) {
> s->sums[i].len = s->remainder;
> } else {
> - s->sums[i].len = s->n;
> + s->sums[i].len = s->blength;
> }
> offset += s->sums[i].len;
>
> @@ -211,9 +224,7 @@
> if (write_batch)
> write_batch_delta_file((char *)&i,sizeof(i));
>
> - write_int(f_out,s->count);
> - write_int(f_out,s->n);
> - write_int(f_out,s->remainder);
> + write_sum_head(f_out, s);
> }
>
> if (verbose > 2)
> @@ -240,9 +251,7 @@
> }
> else {
> write_int(f_out,j);
> - write_int(f_out,s->count);
> - write_int(f_out,s->n);
> - write_int(f_out,s->remainder);
> + write_sum_head(f_out, s);
> done=0;
> while (!done) {
> read_batch_delta_file( (char *) &buff_len,
sizeof(int) );
> --- match.c Sat Mar 29 11:11:30 2003
> +++ match.c Sat Mar 29 12:16:13 2003
> @@ -19,8 +19,6 @@
>
> #include "rsync.h"
>
> -extern int csum_length;
> -
> extern int verbose;
> extern int am_server;
>
> @@ -154,11 +152,11 @@
>
> if (verbose > 2)
> rprintf(FINFO,"hash search b=%ld len=%.0f\n",
> - (long) s->n, (double)len);
> + (long) s->blength, (double)len);
>
> - /* cast is to make s->n signed; it should always be reasonably
> + /* cast is to make s->blength signed; it should always be reasonably
> * small */
> - k = MIN(len, (OFF_T) s->n);
> + k = MIN(len, (OFF_T) s->blength);
>
> map = (schar *)map_ptr(buf,0,k);
>
> @@ -173,8 +171,8 @@
> end = len + 1 - s->sums[s->count-1].len;
>
> if (verbose > 3)
> - rprintf(FINFO, "hash search s->n=%ld len=%.0f count=%ld\n",
> - (long) s->n, (double) len, (long) s->count);
> + rprintf(FINFO, "hash search s->blength=%ld len=%.0f
count=%ld\n",
> + (long) s->blength, (double) len, (long) s->count);
>
> do {
> tag t = gettag2(s1,s2);
> @@ -196,7 +194,7 @@
> if (sum != s->sums[i].sum1) continue;
>
> /* also make sure the two blocks are the same length */
> - l = MIN(s->n,len-offset);
> + l = MIN(s->blength,len-offset);
> if (l != s->sums[i].len) continue;
>
> if (verbose > 3)
> @@ -209,7 +207,7 @@
> done_csum2 = 1;
> }
>
> - if (memcmp(sum2,s->sums[i].sum2,csum_length) != 0) {
> + if (memcmp(sum2,s->sums[i].sum2,s->s2length) != 0) {
> false_alarms++;
> continue;
> }
> @@ -220,7 +218,7 @@
> int i2 = targets[j].i;
> if (i2 == last_i + 1) {
> if (sum != s->sums[i2].sum1) break;
> - if (memcmp(sum2,s->sums[i2].sum2,csum_length) != 0) break;
> + if (memcmp(sum2,s->sums[i2].sum2,s->s2length) != 0) break;
> /* we've found an adjacent match - the RLL coder
> will be happy */
> i = i2;
> @@ -232,7 +230,7 @@
>
> matched(f,s,buf,offset,i);
> offset += s->sums[i].len - 1;
> - k = MIN((len-offset), s->n);
> + k = MIN((len-offset), s->blength);
> map = (schar *)map_ptr(buf,offset,k);
> sum = get_checksum1((char *)map, k);
> s1 = sum & 0xFFFF;
> @@ -262,9 +260,9 @@
> running match, the checksum update and the
> literal send. */
> if (offset > last_match &&
> - offset-last_match >= CHUNK_SIZE+s->n &&
> + offset-last_match >= CHUNK_SIZE+s->blength &&
> (end-offset > CHUNK_SIZE)) {
> - matched(f,s,buf,offset - s->n, -2);
> + matched(f,s,buf,offset - s->blength, -2);
> }
> } while (++offset < end);
>
> --- rsync.h Sat Mar 29 12:24:35 2003
> +++ rsync.h Sat Mar 29 15:11:04 2003
> @@ -339,6 +339,8 @@
> /* the length of the md4 checksum */
> #define MD4_SUM_LENGTH 16
> #define SUM_LENGTH 16
> +#define SHORT_SUM_LENGTH 2
> +#define BLOCKSUM_BIAS 10
>
> #ifndef MAXPATHLEN
> #define MAXPATHLEN 1024
> --- options.c Sat Mar 29 12:25:34 2003
> +++ options.c Sat Mar 29 15:10:42 2003
> @@ -72,7 +72,7 @@
> int keep_partial=0;
> int safe_symlinks=0;
> int copy_unsafe_links=0;
> -int block_size=BLOCK_SIZE;
> +int block_size=0;
> int size_only=0;
> int bwlimit=0;
> int delete_after=0;
> @@ -725,7 +725,7 @@
>
> if (x != 1) args[ac++] = argstr;
>
> - if (block_size != BLOCK_SIZE) {
> + if (block_size) {
> snprintf(bsize,sizeof(bsize),"-B%d",block_size);
> args[ac++] = bsize;
> }
> --- generator.c Sat Mar 29 12:24:49 2003
> +++ generator.c Sat Mar 29 16:21:55 2003
> @@ -32,7 +32,6 @@
> extern int preserve_hard_links;
> extern int update_only;
> extern int opt_ignore_existing;
> -extern int block_size;
> extern int csum_length;
> extern int ignore_times;
> extern int size_only;
> @@ -100,24 +99,9 @@
> }
>
>
> -/* use a larger block size for really big files */
> -static int adapt_block_size(struct file_struct *file, int bsize)
> -{
> - int ret;
> -
> - if (bsize != BLOCK_SIZE) return bsize;
> -
> - ret = file->length / (10000); /* rough heuristic */
> - ret = ret & ~15; /* multiple of 16 */
> - if (ret < bsize) ret = bsize;
> - if (ret > CHUNK_SIZE/2) ret = CHUNK_SIZE/2;
> - return ret;
> -}
> -
> -
> /*
> * NULL sum_struct means we have no checksums
> - */
> + */
>
> void write_sum_head(int f, struct sum_struct *sum)
> {
> @@ -133,7 +117,87 @@
> write_int(f, sum->remainder);
> }
>
> +/*
> + * set (initialize) the size entries in the per-file sum_struct
> + * calulating dynamic block ans checksum sizes.
> + *
> + * This is only called from generate_and_send_sums() but is a seperate
> + * function to encapsulate the logic.
> + *
> + * The block size is a rounded square root of file length.
> + *
> + * The checksum size is determined according to:
> + * blocksum_bits = BLOCKSUM_EXP + 2*log2(file_len) - log2(block_len)
> + * provided by Donovan Baarda which gives a probability of rsync
> + * algorithm corrupting data and falling back using the whole md4
> + * checksums.
> + *
> + * This might be made one of several selectable heuristics.
> + */
>
> +static void sum_sizes_sqroot_baarda(struct sum_struct *sum, uint64 len)
> +{
> + extern int block_size;
> + int blength, s2length, b;
> + uint32 c;
> + uint64 l;
> +
> + if (block_size) {
> + blength = block_size;
> + } else if (len <= BLOCK_SIZE * BLOCK_SIZE) {
> + blength = BLOCK_SIZE;
> + } else {
> + l = len;
> + c = 1;
> + while (l >>= 2) {
> + c <<= 1;
> + }
> + blength = 0;
> + do {
> + blength |= c;
> + if (len < (uint64)(blength * blength))
> + blength &= ~c;
> + c >>= 1;
> + } while (c >= 8); /* round to multiple of 8 */
> + blength = MAX(blength, BLOCK_SIZE);
> + }
> +
> + if (remote_version < 27) {
> + s2length = csum_length;
> + } else if (csum_length == SUM_LENGTH) {
> + s2length = SUM_LENGTH;
> + } else {
> + b = BLOCKSUM_BIAS;
> + l = len;
> + while (l >>= 1) {
> + b += 2;
> + }
> + c = blength;
> + while (c >>= 1 && b) {
> + b--;
> + }
> + s2length = (b + 1 - 32 + 7) / 8; /* add a bit,
> + * subtract rollsum,
> + * round up
> + * --optimize in compiler--
> + */
> + s2length = MAX(s2length, csum_length);
> + s2length = MIN(s2length, SUM_LENGTH);
> + }
> +
> + sum->flength = len;
> + sum->blength = blength;
> + sum->s2length = s2length;
> + sum->count = (len + (blength - 1)) / blength;
> + sum->remainder = (len % blength);
> +
> + if (sum->count && verbose > 2) {
> + rprintf(FINFO, "count=%ld rem=%ld blength=%ld s2length=%ld
flength=%.0f\n",
> + (long) sum->count, (long) sum->remainder,
> + (long) sum->blength, (long) sum->s2length,
> + (double) sum->flength);
> + }
> +}
>
> /**
> * Perhaps we want to just send an empty checksum set for this file,
> @@ -163,30 +227,18 @@
> *
> * Generate approximately one checksum every block_len bytes.
> */
> -static void generate_and_send_sums(struct map_struct *buf, OFF_T len,
> - int block_len, int f_out)
> +static void generate_and_send_sums(struct map_struct *buf, OFF_T len, int
f_out)
> {
> size_t i;
> struct sum_struct sum;
> OFF_T offset = 0;
>
> - sum.count = (len + (block_len - 1)) / block_len;
> - sum.remainder = (len % block_len);
> - sum.blength = block_len;
> - sum.flength = len;
> - sum.s2length = csum_length;
> - /* not needed here sum.sums = NULL; */
> -
> - if (sum.count && verbose > 3) {
> - rprintf(FINFO, "count=%ld rem=%ld n=%ld flength=%.0f\n",
> - (long) sum.count, (long) sum.remainder,
> - (long) sum.blength, (double) sum.flength);
> - }
> + sum_sizes_sqroot_baarda(&sum, len);
>
> write_sum_head(f_out, &sum);
>
> for (i = 0; i < sum.count; i++) {
> - int n1 = MIN(len, block_len);
> + int n1 = MIN(len, sum.blength);
> char *map = map_ptr(buf, offset, n1);
> uint32 sum1 = get_checksum1(map, n1);
> char sum2[SUM_LENGTH];
> @@ -465,8 +517,7 @@
> rprintf(FINFO, "generating and sending sums for %d\n", i);
>
> write_int(f_out,i);
> - generate_and_send_sums(buf, st.st_size,
> - adapt_block_size(file, block_size), f_out);
> + generate_and_send_sums(buf, st.st_size, f_out);
>
> close(fd);
> if (buf) unmap_file(buf);
> --
> To unsubscribe or change options:
http://lists.samba.org/mailman/listinfo/rsync
> Before posting, read: http://www.tuxedo.org/~esr/faqs/smart-questions.html
--
________________________________________________________________
J.W. Schultz Pegasystems Technologies
email address: jw@pegasys.ws
Remember Cernan and Schmitt