On 5 Jan 2005, Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Tue, 2005-01-04 at 18:24 +0100, Robert Lemmen wrote:
> > hi rusty,
> >
> > i read on some webpage about rsync and debian that you wrote a patch to
> > rsync that lets it use heuristics when deciding which local file to
> > use. could you tell me whether this is planned to be included in an rsync
> > release? could i have that patch?
>
> Hmm, good question. This is from 2.5.4, and can't remember how well it
> worked. Good luck!
I'm not the rsync maintainer anymore, but I think it would be cool if
this were merged, if the current team feels OK about it.
>
> Rusty.
>
> diff -urN rsync-2.5.4/Makefile.in rsync-2.5.4-fuzzy/Makefile.in
> --- rsync-2.5.4/Makefile.in 2002-02-26 05:48:25.000000000 +1100
> +++ rsync-2.5.4-fuzzy/Makefile.in 2002-04-03 16:35:55.000000000 +1000
> @@ -28,7 +28,7 @@
> ZLIBOBJ=zlib/deflate.o zlib/infblock.o zlib/infcodes.o zlib/inffast.o \
> zlib/inflate.o zlib/inftrees.o zlib/infutil.o zlib/trees.o \
> zlib/zutil.o zlib/adler32.o
> -OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o
> +OBJS1=rsync.o generator.o receiver.o cleanup.o sender.o exclude.o util.o main.o checksum.o match.o syscall.o log.o backup.o alternate.o
> OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o fileio.o batch.o \
> clientname.o
> DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
> diff -urN rsync-2.5.4/alternate.c rsync-2.5.4-fuzzy/alternate.c
> --- rsync-2.5.4/alternate.c 1970-01-01 10:00:00.000000000 +1000
> +++ rsync-2.5.4-fuzzy/alternate.c 2002-04-03 17:04:15.000000000 +1000
> @@ -0,0 +1,117 @@
> +#include "rsync.h"
> +
> +extern char *compare_dest;
> +extern int verbose;
> +
> +/* Alternate methods for opening files, if local doesn't exist */
> +/* Sanity check that we are about to open regular file */
> +int do_open_regular(char *fname)
> +{
> + STRUCT_STAT st;
> +
> + if (do_stat(fname, &st) == 0 && S_ISREG(st.st_mode))
> + return do_open(fname, O_RDONLY, 0);
> +
> + return -1;
> +}
> +
> +static void split_names(char *fname, char **dirname, char **basename)
> +{
> + char *slash;
> +
> + slash = strrchr(fname, '/');
> + if (slash) {
> + *dirname = fname;
> + *slash = '\0';
> + *basename = slash+1;
> + } else {
> + *basename = fname;
> + *dirname = ".";
> + }
> +}
> +
> +static unsigned int measure_name(const char *name,
> + const char *basename,
> + const char *ext)
> +{
> + int namelen = strlen(name);
> + int extlen = strlen(ext);
> + unsigned int score = 0;
> +
> + /* Extensions must match */
> + if (namelen <= extlen || strcmp(name+namelen-extlen, ext) != 0)
> + return 0;
> +
> + /* Now score depends on similarity of prefix */
> + for (; *name==*basename && *name; name++, basename++)
> + score++;
> + return score;
> +}
> +
> +int open_alternate_base_fuzzy(const char *fname)
> +{
> + DIR *d;
> + struct dirent *di;
> + char *basename, *dirname;
> + char mangled_name[MAXPATHLEN];
> + char bestname[MAXPATHLEN];
> + unsigned int bestscore = 0;
> + const char *ext;
> +
> + /* FIXME: can we assume fname fits here? */
> + strcpy(mangled_name, fname);
> +
> + split_names(mangled_name, &dirname, &basename);
> + d = opendir(dirname);
> + if (!d) {
> + rprintf(FERROR,"recv_generator opendir(%s): %s\n",
> + dirname,strerror(errno));
> + return -1;
> + }
> +
> + /* Get final extension, eg. .gz; never full basename though. */
> + ext = strrchr(basename + 1, '.');
> + if (!ext)
> + ext = basename + strlen(basename); /* ext = "" */
> +
> + while ((di = readdir(d)) != NULL) {
> + const char *dname = d_name(di);
> + unsigned int score;
> +
> + if (strcmp(dname,".")==0 ||
> + strcmp(dname,"..")==0)
> + continue;
> +
> + score = measure_name(dname, basename, ext);
> + if (verbose > 4)
> + rprintf(FINFO,"fuzzy score for %s = %u\n",
> + dname, score);
> + if (score > bestscore) {
> + strcpy(bestname, dname);
> + bestscore = score;
> + }
> + }
> + closedir(d);
> +
> + /* Found a candidate. */
> + if (bestscore != 0) {
> + char fuzzyname[MAXPATHLEN];
> +
> + snprintf(fuzzyname,MAXPATHLEN,"%s/%s", dirname, bestname);
> + if (verbose > 2)
> + rprintf(FINFO,"fuzzy match %s->%s\n",
> + fname, fuzzyname);
> + return do_open_regular(fuzzyname);
> + }
> + return -1;
> +}
> +
> +int open_alternate_base_comparedir(const char *fname)
> +{
> + char fnamebuf[MAXPATHLEN];
> + /* try the file at compare_dest instead */
> + snprintf(fnamebuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
> +
> + /* FIXME: now follows symlinks... */
> + return do_open_regular(fnamebuf);
> +}
> diff -urN rsync-2.5.4/generator.c rsync-2.5.4-fuzzy/generator.c
> --- rsync-2.5.4/generator.c 2002-02-08 03:36:12.000000000 +1100
> +++ rsync-2.5.4-fuzzy/generator.c 2002-04-03 17:00:06.000000000 +1000
> @@ -42,11 +42,12 @@
> extern int always_checksum;
> extern int modify_window;
> extern char *compare_dest;
> +extern int fuzzy;
>
>
> /* choose whether to skip a particular file */
> static int skip_file(char *fname,
> - struct file_struct *file, STRUCT_STAT *st)
> + struct file_struct *file, const STRUCT_STAT *st)
> {
> if (st->st_size != file->length) {
> return 0;
> @@ -185,7 +186,61 @@
> return s;
> }
>
> +/* Returns -1 for can't open (null file), -2 for skip */
> +static int open_base_file(struct file_struct *file,
> + char *fname,
> + int statret,
> + STRUCT_STAT *st)
> +{
> + int fd = -1;
> +
> + if (statret == 0) {
> + if (S_ISREG(st->st_mode)) {
> + if (update_only
> + && cmp_modtime(st->st_mtime, file->modtime) > 0) {
> + if (verbose > 1)
> + rprintf(FINFO,"%s is newer\n",fname);
> + return -2;
> + }
> + if (skip_file(fname, file, st)) {
> + set_perms(fname, file, st, 1);
> + return -2;
> + }
> + fd = do_open(fname, O_RDONLY, 0);
> + if (fd == -1) {
> + rprintf(FERROR,"failed to open %s, continuing : %s\n",fname,strerror(errno));
> + return -1;
> + } else
> + return fd;
> + } else {
> + /* Try to use symlink contents */
> + if (S_ISLNK(st->st_mode)) {
> + fd = do_open_regular(fname);
> + /* Don't delete yet; receiver will need it */
> + } else {
> + if (delete_file(fname) != 0) {
> + if (fd != -1)
> + close(fd);
> + return -2;
> + }
> + }
> + }
> + }
> +
> + if (fd == -1 && compare_dest != NULL)
> + fd = open_alternate_base_comparedir(fname);
>
> + if (fd == -1 && fuzzy)
> + fd = open_alternate_base_fuzzy(fname);
> +
> + /* Update stat to understand size */
> + if (fd != -1) {
> + if (do_fstat(fd, st) != 0)
> + rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
> + }
> +
> + return fd;
> +}
>
> /*
> * Acts on file number I from FLIST, whose name is fname.
> @@ -203,9 +258,6 @@
> struct sum_struct *s;
> int statret;
> struct file_struct *file = flist->files[i];
> - char *fnamecmp;
> - char fnamecmpbuf[MAXPATHLEN];
> - extern char *compare_dest;
> extern int list_only;
> extern int preserve_perms;
> extern int only_existing;
> @@ -341,82 +393,29 @@
> return;
> }
>
> - fnamecmp = fname;
> -
> - if ((statret == -1) && (compare_dest != NULL)) {
> - /* try the file at compare_dest instead */
> - int saveerrno = errno;
> - snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",compare_dest,fname);
> - statret = link_stat(fnamecmpbuf,&st);
> - if (!S_ISREG(st.st_mode))
> - statret = -1;
> - if (statret == -1)
> - errno = saveerrno;
> - else
> - fnamecmp = fnamecmpbuf;
> - }
> -
> - if (statret == -1) {
> - if (errno == ENOENT) {
> - write_int(f_out,i);
> - if (!dry_run) send_sums(NULL,f_out);
> - } else {
> - if (verbose > 1)
> - rprintf(FERROR, RSYNC_NAME
> - ": recv_generator failed to open \"%s\": %s\n",
> - fname, strerror(errno));
> - }
> - return;
> - }
> -
> - if (!S_ISREG(st.st_mode)) {
> - if (delete_file(fname) != 0) {
> - return;
> - }
> -
> - /* now pretend the file didn't exist */
> - write_int(f_out,i);
> - if (!dry_run) send_sums(NULL,f_out);
> - return;
> - }
> -
> - if (opt_ignore_existing && fnamecmp == fname) {
> - if (verbose > 1)
> - rprintf(FINFO,"%s exists\n",fname);
> - return;
> - }
> -
> - if (update_only && cmp_modtime(st.st_mtime,file->modtime)>0 && fnamecmp == fname) {
> + /* Failed to stat for some other reason. */
> + if (statret == -1 && errno != ENOENT) {
> if (verbose > 1)
> - rprintf(FINFO,"%s is newer\n",fname);
> + rprintf(FERROR, RSYNC_NAME
> + ": recv_generator failed to open \"%s\": %s\n",
> + fname, strerror(errno));
> return;
> }
>
> - if (skip_file(fname, file, &st)) {
> - if (fnamecmp == fname)
> - set_perms(fname,file,&st,1);
> - return;
> - }
> -
> - if (dry_run) {
> - write_int(f_out,i);
> + fd = open_base_file(file, fname, statret, &st);
> + if (fd == -2)
> return;
> - }
> -
> - if (whole_file) {
> - write_int(f_out,i);
> - send_sums(NULL,f_out);
> - return;
> - }
> -
> - /* open the file */
> - fd = do_open(fnamecmp, O_RDONLY, 0);
>
> - if (fd == -1) {
> - rprintf(FERROR,RSYNC_NAME": failed to open \"%s\", continuing : %s\n",fnamecmp,strerror(errno));
> - /* pretend the file didn't exist */
> + if ((whole_file || dry_run) && fd != -1) {
> + close(fd);
> + fd = -1;
> + }
> +
> + if (fd == -1) {
> + /* the file didn't exist, or we can pretend it doesn't */
> write_int(f_out,i);
> - send_sums(NULL,f_out);
> + if (!dry_run)
> + send_sums(NULL,f_out);
> return;
> }
>
> @@ -427,7 +426,7 @@
> }
>
> if (verbose > 3)
> - rprintf(FINFO,"gen mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
> + rprintf(FINFO,"gen mapped %s of size %.0f\n",fname,(double)st.st_size);
>
> s = generate_sums(buf,st.st_size,adapt_block_size(file, block_size));
>
> diff -urN rsync-2.5.4/options.c rsync-2.5.4-fuzzy/options.c
> --- rsync-2.5.4/options.c 2002-02-28 09:49:57.000000000 +1100
> +++ rsync-2.5.4-fuzzy/options.c 2002-04-03 16:43:54.000000000 +1000
> @@ -73,6 +73,7 @@
> #else
> int modify_window=0;
> #endif
> +int fuzzy=0;
> int blocking_io=-1;
>
> /** Network address family. **/
> @@ -245,6 +246,7 @@
> rprintf(F," --bwlimit=KBPS limit I/O bandwidth, KBytes per second\n");
> rprintf(F," --write-batch=PREFIX write batch fileset starting with PREFIX\n");
> rprintf(F," --read-batch=PREFIX read batch fileset starting with PREFIX\n");
> + rprintf(F," --fuzzy use similar file as basis if it does't exist\n");
> rprintf(F," -h, --help show this help screen\n");
> #ifdef INET6
> rprintf(F," -4 prefer IPv4\n");
> @@ -340,6 +342,7 @@
> {"hard-links", 'H', POPT_ARG_NONE, &preserve_hard_links},
> {"read-batch", 0, POPT_ARG_STRING, &batch_prefix, OPT_READ_BATCH},
> {"write-batch", 0, POPT_ARG_STRING, &batch_prefix, OPT_WRITE_BATCH},
> + {"fuzzy", 0, POPT_ARG_NONE, &fuzzy},
> #ifdef INET6
> {0, '4', POPT_ARG_VAL, &default_af_hint, AF_INET },
> {0, '6', POPT_ARG_VAL, &default_af_hint, AF_INET6 },
> @@ -757,7 +760,9 @@
> args[ac++] = "--compare-dest";
> args[ac++] = compare_dest;
> }
> -
> +
> + if (fuzzy && am_sender)
> + args[ac++] = "--fuzzy";
>
> *argc = ac;
> }
> diff -urN rsync-2.5.4/proto.h rsync-2.5.4-fuzzy/proto.h
> --- rsync-2.5.4/proto.h 2002-02-23 11:05:06.000000000 +1100
> +++ rsync-2.5.4-fuzzy/proto.h 2002-04-03 16:35:25.000000000 +1000
> @@ -256,3 +256,6 @@
> int cmp_modtime(time_t file1, time_t file2);
> int _Insure_trap_error(int a1, int a2, int a3, int a4, int a5, int a6);
> int sys_gettimeofday(struct timeval *tv);
> +int do_open_regular(char *fname);
> +int open_alternate_base_fuzzy(const char *fname);
> +int open_alternate_base_comparedir(const char *fname);
> diff -urN rsync-2.5.4/receiver.c rsync-2.5.4-fuzzy/receiver.c
> --- rsync-2.5.4/receiver.c 2002-02-14 05:42:20.000000000 +1100
> +++ rsync-2.5.4-fuzzy/receiver.c 2002-04-03 16:46:46.000000000 +1000
> @@ -36,6 +36,7 @@
> extern char *compare_dest;
> extern int make_backups;
> extern char *backup_suffix;
> +extern int fuzzy;
>
> static struct delete_list {
> DEV64_T dev;
> @@ -307,8 +308,6 @@
> char *fname;
> char template[MAXPATHLEN];
> char fnametmp[MAXPATHLEN];
> - char *fnamecmp;
> - char fnamecmpbuf[MAXPATHLEN];
> struct map_struct *buf;
> int i;
> struct file_struct *file;
> @@ -366,28 +365,24 @@
> if (verbose > 2)
> rprintf(FINFO,"recv_files(%s)\n",fname);
>
> - fnamecmp = fname;
> -
> /* open the file */
> - fd1 = do_open(fnamecmp, O_RDONLY, 0);
> + fd1 = do_open(fname, O_RDONLY, 0);
>
> - if ((fd1 == -1) && (compare_dest != NULL)) {
> - /* try the file at compare_dest instead */
> - snprintf(fnamecmpbuf,MAXPATHLEN,"%s/%s",
> - compare_dest,fname);
> - fnamecmp = fnamecmpbuf;
> - fd1 = do_open(fnamecmp, O_RDONLY, 0);
> - }
> + if (fd1 == -1 && compare_dest != NULL)
> + fd1 = open_alternate_base_comparedir(fname);
> +
> + if (fd1 == -1 && fuzzy)
> + fd1 = open_alternate_base_fuzzy(fname);
>
> if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
> - rprintf(FERROR,"fstat %s : %s\n",fnamecmp,strerror(errno));
> + rprintf(FERROR,"fstat %s : %s\n",fname,strerror(errno));
> receive_data(f_in,NULL,-1,NULL,file->length);
> close(fd1);
> continue;
> }
>
> if (fd1 != -1 && !S_ISREG(st.st_mode)) {
> - rprintf(FERROR,"%s : not a regular file (recv_files)\n",fnamecmp);
> + rprintf(FERROR,"%s : not a regular file (recv_files)\n",fname);
> receive_data(f_in,NULL,-1,NULL,file->length);
> close(fd1);
> continue;
> @@ -403,7 +398,7 @@
> if (fd1 != -1 && st.st_size > 0) {
> buf = map_file(fd1,st.st_size);
> if (verbose > 2)
> - rprintf(FINFO,"recv mapped %s of size %.0f\n",fnamecmp,(double)st.st_size);
> + rprintf(FINFO,"recv mapped %s of size %.0f\n",fname,(double)st.st_size);
> } else {
> buf = NULL;
> }
>
> --
> A bad analogy is like a leaky screwdriver -- Richard Braakman
--
Martin