Hi, A frequently seen issue with importing SPSS data files, is that R does not import the 'long variable names'. I built a patch on the R-project's foreign module, in order to import the 'long variable names' from SPSS (record 7, subtype 13). To complete the job, I had to expand the "struct variable" definition to have 64 +1 charachters. I'm not aware of side effects. The sfm-read.c code works fine. I didn't test a variety of platforms, as I don't have an idea of what is regarded as sufficient testing. Anyway, I don't expect major troubles there (no byteswapping problems, no 32<->64 bit issues) as it's mainly character processing. The patch is relative to the foreign directory. It was created against the trunk of R-project yesterday. We would appreciate that you import such patch into the main tree. Kind regards, Kurt Van Dijck (C programmer) & Ilse Laurijssen (R user) Belgium Index: src/sfm-read.c ==================================================================--- src/sfm-read.c (revision 5168) +++ src/sfm-read.c (working copy) @@ -188,6 +188,8 @@ static int read_variables (struct file_handle * h, struct variable *** var_by_index); static int read_machine_int32_info (struct file_handle * h, int size, int count, int *encoding); static int read_machine_flt64_info (struct file_handle * h, int size, int count); +static int read_long_var_names (struct file_handle * h, struct dictionary * + , unsigned long size, unsigned int count); static int read_documents (struct file_handle * h); /* Displays the message X with corrupt_msg, then jumps to the lossage @@ -418,11 +420,15 @@ break; case 7: /* Multiple-response sets (later versions of SPSS). */ - case 13: /* long variable names. PSPP now has code for these - that could be ported if someone is interested. */ skip = 1; break; + case 13: /* long variable names. PSPP now has code for these + that could be ported if someone is interested. */ + if (!read_long_var_names(h, ext->dict, data.size, data.count)) + goto lossage; + break; + case 16: /* See http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */ skip = 1; break; @@ -584,14 +590,72 @@ return 0; } +/* Read record type 7, subtype 13. + * long variable names + */ static int +read_long_var_names (struct file_handle * h, struct dictionary * dict + , unsigned long size, unsigned int count) +{ + char * data; + unsigned int j; + struct variable ** lp; + struct variable ** end; + char * p; + char * endp; + char * val; + if ((1 != size)||(0 == count)) { + warning("%s: strange record info seen, size=%u, count=%u" + ", ignoring long variable names" + , h->fn, size, count); + return 0; + } + size *= count; + data = Calloc (size +1, char); + bufread(h, data, size, 0); + /* parse */ + end = &dict->var[dict->nvar]; + p = data; + do { + if (0 != (endp = strchr(p, '\t'))) + *endp = 0; /* put null terminator */ + if (0 == (val = strchr(p, '='))) { + warning("%s: no long variable name for variable '%s'", h->fn, p); + } else { + *val = 0; + ++val; + /* now, p is key, val is long name */ + for (lp = dict->var; lp < end; ++lp) { + if (!strcmp(lp[0]->name, p)) { + strncpy(lp[0]->name, val, sizeof(lp[0]->name)); + break; + } + } + if (lp >= end) { + warning("%s: long variable name mapping '%s' to '%s'" + "for variable which does not exist" + , h->fn, p, val); + } + } + p = &endp[1]; /* put to next */ + } while (endp); + + free(data); + return 1; + +lossage: + free(data); + return 0; +} + +static int read_header (struct file_handle * h, struct sfm_read_info * inf) { struct sfm_fhuser_ext *ext = h->ext; /* File extension strcut. */ struct sysfile_header hdr; /* Disk buffer. */ struct dictionary *dict; /* File dictionary. */ char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */ - int skip_amt = 0; /* Amount of product name to omit. */ + int skip_amt = 0; /* Amount of product name to omit. */ int i; /* Create the dictionary. */ @@ -1495,7 +1559,7 @@ /* Reads one case from system file H into the value array PERM according to the instructions given in associated dictionary DICT, which must have the get.* elements appropriately set. Returns - nonzero only if successful. */ + nonzero only if successful. */ int sfm_read_case (struct file_handle * h, union value * perm, struct dictionary * dict) { Index: src/var.h.in ==================================================================--- src/var.h.in (revision 5168) +++ src/var.h.in (working copy) @@ -41,6 +41,10 @@ #error MAX_SHORT_STRING must be less than 8. #endif +/* VAR_NAME_LEN: the length of a variable. + * SPSS supports names of 64 long + */ +#define VAR_NAME_LEN 64 /* Special values. */ #define SYSMIS (-DBL_MAX) #define LOWEST second_lowest_double_val() @@ -228,7 +232,7 @@ /* MODIFY VARS private data. */ struct modify_vars_proc { - char new_name[9]; /* Variable's new name. */ + char new_name[VAR_NAME_LEN +1]; /* Variable's new name. */ int drop_this_var; /* 0=keep this var, 1=drop this var. */ struct variable *next; /* Next in linked list. */ }; @@ -302,7 +306,7 @@ struct variable { /* Required by parse_variables() to be in this order. */ - char name[9]; /* As a string. */ + char name[VAR_NAME_LEN +1]; /* As a string. */ int index; /* Index into its dictionary's var[]. */ int type; /* NUMERIC or ALPHA. */ int foo; /* Used for temporary storage. */ @@ -373,9 +377,9 @@ int weight_index; /* `value' index of $WEIGHT, or -1 if none. Call update_weighting() before using! */ - char weight_var[9]; /* Name of WEIGHT variable. */ + char weight_var[VAR_NAME_LEN];/* Name of WEIGHT variable. */ - char filter_var[9]; /* Name of FILTER variable. */ + char filter_var[VAR_NAME_LEN];/* Name of FILTER variable. */ /* Do not make another field the last field! or see temporary.c:restore_dictionary() before doing so! */ };
Hi all, I got no feedback at all concerning the merge of this patch in the source tree. Am I supposed to do this myself? How should I do this (do I have subversion commit access)? Is this patch acceptable at all? Is it being tested? I got some personal reactions on my post, proving there is general interest in getting rid of the inconvenience of importing long labels from SPSS files. Kurt Van Dijck wrote:> Hi, > > A frequently seen issue with importing SPSS data files, is that R does > not import the 'long variable names'. > I built a patch on the R-project's foreign module, in order to import > the 'long variable names' from SPSS (record 7, subtype 13). > To complete the job, I had to expand the "struct variable" definition > to have 64 +1 charachters. I'm not aware of side effects. > The sfm-read.c code works fine. > I didn't test a variety of platforms, as I don't have an idea of what is > regarded as sufficient testing. Anyway, I don't expect major troubles there > (no byteswapping problems, no 32<->64 bit issues) as it's mainly > character processing. > The patch is relative to the foreign directory. It was created > against the trunk of R-project yesterday. > > We would appreciate that you import such patch into the main tree. > > Kind regards, > > Kurt Van Dijck (C programmer) & Ilse Laurijssen (R user) > Belgium > > Index: src/sfm-read.c > ==================================================================> --- src/sfm-read.c (revision 5168) > +++ src/sfm-read.c (working copy) > @@ -188,6 +188,8 @@ > static int read_variables (struct file_handle * h, struct variable *** > var_by_index); > static int read_machine_int32_info (struct file_handle * h, int size, > int count, int *encoding); > static int read_machine_flt64_info (struct file_handle * h, int size, > int count); > +static int read_long_var_names (struct file_handle * h, struct > dictionary * > + , unsigned long size, unsigned int count); > static int read_documents (struct file_handle * h); > > /* Displays the message X with corrupt_msg, then jumps to the lossage > @@ -418,11 +420,15 @@ > break; > > case 7: /* Multiple-response sets (later versions of SPSS). */ > - case 13: /* long variable names. PSPP now has code for these > - that could be ported if someone is interested. */ > skip = 1; > break; > > + case 13: /* long variable names. PSPP now has code for these > + that could be ported if someone is interested. */ > + if (!read_long_var_names(h, ext->dict, data.size, data.count)) > + goto lossage; > + break; > + > case 16: /* See > http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */ > skip = 1; > break; > @@ -584,14 +590,72 @@ > return 0; > } > > +/* Read record type 7, subtype 13. > + * long variable names > + */ > static int > +read_long_var_names (struct file_handle * h, struct dictionary * dict > + , unsigned long size, unsigned int count) > +{ > + char * data; > + unsigned int j; > + struct variable ** lp; > + struct variable ** end; > + char * p; > + char * endp; > + char * val; > + if ((1 != size)||(0 == count)) { > + warning("%s: strange record info seen, size=%u, count=%u" > + ", ignoring long variable names" > + , h->fn, size, count); > + return 0; > + } > + size *= count; > + data = Calloc (size +1, char); > + bufread(h, data, size, 0); > + /* parse */ > + end = &dict->var[dict->nvar]; > + p = data; > + do { > + if (0 != (endp = strchr(p, '\t'))) > + *endp = 0; /* put null terminator */ > + if (0 == (val = strchr(p, '='))) { > + warning("%s: no long variable name for variable '%s'", h->fn, p); > + } else { > + *val = 0; > + ++val; > + /* now, p is key, val is long name */ > + for (lp = dict->var; lp < end; ++lp) { > + if (!strcmp(lp[0]->name, p)) { > + strncpy(lp[0]->name, val, sizeof(lp[0]->name)); > + break; > + } > + } > + if (lp >= end) { > + warning("%s: long variable name mapping '%s' to '%s'" > + "for variable which does not exist" > + , h->fn, p, val); > + } > + } > + p = &endp[1]; /* put to next */ > + } while (endp); > + > + free(data); > + return 1; > + > +lossage: > + free(data); > + return 0; > +} > + > +static int > read_header (struct file_handle * h, struct sfm_read_info * inf) > { > struct sfm_fhuser_ext *ext = h->ext; /* File extension strcut. */ > struct sysfile_header hdr; /* Disk buffer. */ > struct dictionary *dict; /* File dictionary. */ > char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product > name. */ > - int skip_amt = 0; /* Amount of product name to omit. */ > + int skip_amt = 0; /* Amount of product name to omit. */ > int i; > > /* Create the dictionary. */ > @@ -1495,7 +1559,7 @@ > /* Reads one case from system file H into the value array PERM > according to the instructions given in associated dictionary DICT, > which must have the get.* elements appropriately set. Returns > - nonzero only if successful. */ > + nonzero only if successful. */ > int > sfm_read_case (struct file_handle * h, union value * perm, struct > dictionary * dict) > { > Index: src/var.h.in > ==================================================================> --- src/var.h.in (revision 5168) > +++ src/var.h.in (working copy) > @@ -41,6 +41,10 @@ > #error MAX_SHORT_STRING must be less than 8. > #endif > > +/* VAR_NAME_LEN: the length of a variable. > + * SPSS supports names of 64 long > + */ > +#define VAR_NAME_LEN 64 > /* Special values. */ > #define SYSMIS (-DBL_MAX) > #define LOWEST second_lowest_double_val() > @@ -228,7 +232,7 @@ > /* MODIFY VARS private data. */ > struct modify_vars_proc > { > - char new_name[9]; /* Variable's new name. */ > + char new_name[VAR_NAME_LEN +1]; /* Variable's new name. */ > int drop_this_var; /* 0=keep this var, 1=drop this var. */ > struct variable *next; /* Next in linked list. */ > }; > @@ -302,7 +306,7 @@ > struct variable > { > /* Required by parse_variables() to be in this order. */ > - char name[9]; /* As a string. */ > + char name[VAR_NAME_LEN +1]; /* As a string. */ > int index; /* Index into its dictionary's var[]. */ > int type; /* NUMERIC or ALPHA. */ > int foo; /* Used for temporary storage. */ > @@ -373,9 +377,9 @@ > > int weight_index; /* `value' index of $WEIGHT, or -1 if none. > Call update_weighting() before using! */ > - char weight_var[9]; /* Name of WEIGHT variable. */ > + char weight_var[VAR_NAME_LEN];/* Name of WEIGHT variable. */ > > - char filter_var[9]; /* Name of FILTER variable. */ > + char filter_var[VAR_NAME_LEN];/* Name of FILTER variable. */ > /* Do not make another field the last field! or see > temporary.c:restore_dictionary() before doing so! */ > }; > > ______________________________________________ > R-devel at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-devel
On Tue, Jul 15, 2008 at 09:29:22AM +0100, Prof Brian Ripley wrote:> On Tue, 15 Jul 2008, Martin Maechler wrote: > > >Hi Kurt, > > > >>>>>>"KVD" == Kurt Van Dijck <kurt.van.dijck at skynet.be> > >>>>>> on Wed, 09 Jul 2008 10:05:39 +0200 writes: > > > > KVD> Hi all, I got no feedback at all concerning the merge > > KVD> of this patch in the source tree. Am I supposed to do > > KVD> this myself? How should I do this (do I have subversion > > KVD> commit access)? Is this patch acceptable at all? Is it > > KVD> being tested? > > > >I don't know if it's being tested. > >It's vacation and traveling time, also for the R core team. > > Indeed. This is on my TODO list, but I've been away (and unexpectedly > mainly offline) for the last two weeks, and will be again until Friday. > > Hopefully I will have a chance to take a look next week, but we do need > at least one example file. (I could generate SPSS examples, but they may > not be what you are trying to test.) > > > > >The foreign package source is kept in svn-archive > > https://svn.r-project.org/R-packages/trunk/foreign/ > > > >and I have tried to apply your patch (from July 2) to the > >sources but > > > > patch -p0 < K_Van_Dijck_patch > > > > patching file src/sfm-read.c > > Hunk #1 FAILED at 188. > > Hunk #2 FAILED at 420. > > Hunk #3 FAILED at 590. > > Hunk #4 FAILED at 1559. > > 4 out of 4 hunks FAILED -- saving rejects to file src/sfm-read.c.rej > > patching file src/var.h.in > > Hunk #1 FAILED at 41. > > Hunk #2 FAILED at 232. > > Hunk #3 FAILED at 306. > > Hunk #4 FAILED at 377. > > 4 out of 4 hunks FAILED -- saving rejects to file src/var.h.in.rej > > > > > >Could you provide a patch against the development code from the > >above url ? > >(after installing 'subversion', you get the development directory by > > svn co https://svn.r-project.org/R-packages/trunk/foreign/ > >)I had problems with whitespace in the patch file, I attached a new one> > KVD> I got some personal reactions on my post, proving there > > KVD> is general interest in getting rid of the inconvenience > > KVD> of importing long labels from SPSS files. > > > >My problem is that I cannot do much testing apart from the tests > >already present in foreign/tests/spss.R > > > >Could you provide a new small *.sav file and a corresponding > >read.spss() call which exhibits the > >problems and is fixed by your patch? > > > >Thank you in advance for your contribution! > >Best regards, > >I ran the spss.R in tests/, it worked fine. Be sure to clean all object files before compiling. Ilse made me a test .sav file (attached) with 2 variables (varialbe1 & variable2), 3 records. This piece of R code shows the problem: # to resolve locale problem Sys.setlocale (locale="C"); # read spss datafile library(foreign); data = read.spss("spss_long.sav", to.data.frame=TRUE); # to.data.frame not necessary, but gives nicer output # commands to show the data, the variable names and labels data; names(data); attr(data, "variable.labels"); # result in unpatched version: # both variable names are in shortened form # (max 8 characters; provided in SPSS-file) #> data; # VARIABLE V2_A #1 1 1 #2 2 1 #3 2 3 # #> names(data); #[1] "VARIABLE" "V2_A" # #> attr(data,"variable.labels"); # VARIABLE V2_A #"variable1" "variable2" # and in patched version: # variable names are the full names as originally defined in the SPSS-file #> data; # variable1 variable2 #1 1 1 #2 2 1 #3 2 3 #> names(data); #[1] "variable1" "variable2" #> attr(data, "variable.labels"); # variable1 variable2 #"variable1" "variable2" Kind regards, Kurt & Ilse Index: src/sfm-read.c ==================================================================--- src/sfm-read.c (revision 5175) +++ src/sfm-read.c (working copy) @@ -188,6 +188,8 @@ static int read_variables (struct file_handle * h, struct variable *** var_by_index); static int read_machine_int32_info (struct file_handle * h, int size, int count, int *encoding); static int read_machine_flt64_info (struct file_handle * h, int size, int count); +static int read_long_var_names (struct file_handle * h, struct dictionary * + , unsigned long size, unsigned int count); static int read_documents (struct file_handle * h); /* Displays the message X with corrupt_msg, then jumps to the lossage @@ -418,11 +420,15 @@ break; case 7: /* Multiple-response sets (later versions of SPSS). */ - case 13: /* long variable names. PSPP now has code for these - that could be ported if someone is interested. */ skip = 1; break; + case 13: /* long variable names. PSPP now has code for these + that could be ported if someone is interested. */ + if (!read_long_var_names(h, ext->dict, data.size, data.count)) + goto lossage; + break; + case 16: /* See http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */ skip = 1; break; @@ -584,14 +590,72 @@ return 0; } +/* Read record type 7, subtype 13. + * long variable names + */ static int +read_long_var_names (struct file_handle * h, struct dictionary * dict + , unsigned long size, unsigned int count) +{ + char * data; + unsigned int j; + struct variable ** lp; + struct variable ** end; + char * p; + char * endp; + char * val; + if ((1 != size)||(0 == count)) { + warning("%s: strange record info seen, size=%u, count=%u" + ", ignoring long variable names" + , h->fn, size, count); + return 0; + } + size *= count; + data = Calloc (size +1, char); + bufread(h, data, size, 0); + /* parse */ + end = &dict->var[dict->nvar]; + p = data; + do { + if (0 != (endp = strchr(p, '\t'))) + *endp = 0; /* put null terminator */ + if (0 == (val = strchr(p, '='))) { + warning("%s: no long variable name for variable '%s'", h->fn, p); + } else { + *val = 0; + ++val; + /* now, p is key, val is long name */ + for (lp = dict->var; lp < end; ++lp) { + if (!strcmp(lp[0]->name, p)) { + strncpy(lp[0]->name, val, sizeof(lp[0]->name)); + break; + } + } + if (lp >= end) { + warning("%s: long variable name mapping '%s' to '%s'" + "for variable which does not exist" + , h->fn, p, val); + } + } + p = &endp[1]; /* put to next */ + } while (endp); + + free(data); + return 1; + +lossage: + free(data); + return 0; +} + +static int read_header (struct file_handle * h, struct sfm_read_info * inf) { struct sfm_fhuser_ext *ext = h->ext; /* File extension strcut. */ struct sysfile_header hdr; /* Disk buffer. */ struct dictionary *dict; /* File dictionary. */ char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */ - int skip_amt = 0; /* Amount of product name to omit. */ + int skip_amt = 0; /* Amount of product name to omit. */ int i; /* Create the dictionary. */ @@ -1495,7 +1559,7 @@ /* Reads one case from system file H into the value array PERM according to the instructions given in associated dictionary DICT, which must have the get.* elements appropriately set. Returns - nonzero only if successful. */ + nonzero only if successful. */ int sfm_read_case (struct file_handle * h, union value * perm, struct dictionary * dict) { Index: src/var.h.in ==================================================================--- src/var.h.in (revision 5175) +++ src/var.h.in (working copy) @@ -41,6 +41,10 @@ #error MAX_SHORT_STRING must be less than 8. #endif +/* VAR_NAME_LEN: the length of a variable. + * SPSS supports names of 64 long + */ +#define VAR_NAME_LEN 64 /* Special values. */ #define SYSMIS (-DBL_MAX) #define LOWEST second_lowest_double_val() @@ -228,7 +232,7 @@ /* MODIFY VARS private data. */ struct modify_vars_proc { - char new_name[9]; /* Variable's new name. */ + char new_name[VAR_NAME_LEN +1]; /* Variable's new name. */ int drop_this_var; /* 0=keep this var, 1=drop this var. */ struct variable *next; /* Next in linked list. */ }; @@ -302,7 +306,7 @@ struct variable { /* Required by parse_variables() to be in this order. */ - char name[9]; /* As a string. */ + char name[VAR_NAME_LEN +1]; /* As a string. */ int index; /* Index into its dictionary's var[]. */ int type; /* NUMERIC or ALPHA. */ int foo; /* Used for temporary storage. */ @@ -373,9 +377,9 @@ int weight_index; /* `value' index of $WEIGHT, or -1 if none. Call update_weighting() before using! */ - char weight_var[9]; /* Name of WEIGHT variable. */ + char weight_var[VAR_NAME_LEN];/* Name of WEIGHT variable. */ - char filter_var[9]; /* Name of FILTER variable. */ + char filter_var[VAR_NAME_LEN];/* Name of FILTER variable. */ /* Do not make another field the last field! or see temporary.c:restore_dictionary() before doing so! */ };