Hi,
Try:
library(stringr)
##### Created the selected files (98) in a separate working? folder
(SubsetFiles1) (refer to my previous mail)
filelst <- list.files()
#Sublst <- filelst[1:2]
res <- lapply(filelst,function(x) {con <- file(x)
??? ?Lines1 <- readLines(con) close(con)
??? ?Lines2 <- Lines1[-1]
??? ?Lines3 <- str_split(Lines2,"-9999.9M")
??? ?Lines4 <- str_trim(unlist(lapply(Lines3,function(x) {x[x==""]
<- NA
??? ?paste(x,collapse=" ")})))
??? ?Lines5 <- gsub("(\\d+)[A-Za-z]","\\1",Lines4)
??? ?res1 <- read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)
??? ?res1})
##Created another folder "Modified" to store the "res" files
lapply(seq_along(res),function(i)
write.table(res[[i]],paste("/home/arunksa111/Zl/Modified",paste0("Mod_",filelst[i]),sep="/"),row.names=FALSE,quote=FALSE))
?lstf1 <- list.files(path="/home/arunksa111/Zl/Modified")?
lst1 <- lapply(lstf1,function(x)
readLines(paste("/home/arunksa111/Zl/Modified",x,sep="/")))
?which(lapply(lst1,function(x) length(grep("\\d+-9999.9",x)))>0 )
?#[1]? 7 11 14 15 30 32 39 40 42 45 46 53 60 65 66 68 69 70 73 74 75 78 80 82 83
#[26] 86 87 90 91 93
lst2 <- lapply(lst1,function(x) gsub("(\\d+)(-9999.9)","\\1
\\2",x))
?#lapply(lst2,function(x) x[grep("\\d+-9999.9",x)]) ##checking for the
pattern
lst3 <- lapply(lst2,function(x)
{x<-gsub("(-9999.9)(-9999.9)","\\1 \\2",x)})#
#lapply(lst3,function(x) x[grep("\\d+-9999.9",x)])? ##checking for the
pattern
# lapply(lst3,function(x) x[grep("-9999.9",x)]) ###second check
lst4 <- lapply(lst3,function(x) gsub("(Day)
(\\d+)","\\1_\\2", x[-1]))? #removed the additional header
"V1", "V2", etc.
#sapply(lst4,function(x) length(strsplit(x[1]," ")[[1]])) #checking
the number of columns that should be present
lst5 <- lapply(lst4,function(x) unlist(lapply(x, function(y) word(y,1,33))))
lst6 <- lapply(lst5,function(x)
read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
# head(lst6[[94]],3)
lst7 <- lapply(lst6,function(x) x[x$Year >=1961 & x$Year <=2005,])
#head(lst7[[45]],3)
?lst8 <- lapply(lst7,function(x) x[!is.na(x$Year),])
lst9 <- lapply(lst8,function(x) {
??? if((min(x$Year)>1961)|(max(x$Year)<2005)){
????? n1<- (min(x$Year)-1961)*12
????? x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
????? n2<- (2005-max(x$Year))*12
????? x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
?????? colnames(x1) <- colnames(x)
?????? colnames(x2) <- colnames(x)??? ???
????? x3<- rbind(x1,x,x2)
??? }
?? else if((min(x$Year)==1961) & (max(x$Year)==2005)) {
??? ????? if((min(x$Mo[x$Year==1961])>1)|(max(x$Mo[x$Year==2005])<12)){
??? ?? n1 <- min(x$Mo[x$Year==1961])-1
??? ?? x1 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
??? ?? n2 <- (12-max(x$Mo[x$Year==2005])) ??? ????
??? ?? x2 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
??? ?? colnames(x1) <- colnames(x)
??? ?? colnames(x2) <- colnames(x)
??? ?? x3 <- rbind(x1,x,x2)
??? ? }
??? ??? else {???
??? ??? x
??? }
????
??? } })
which(sapply(lst9,nrow)!=540)
#[1] 45 46 54 64 65 66 70 75 97
lst10 <- lapply(lst9,function(x) {x1 <- x[!is.na(x$Year),]
??? ??? ??? ?hx1 <- head(x1,1)
??? ??? ??? ?tx1 <- tail(x1,1)
??? ??? ??? ?x2 <- as.data.frame(matrix(NA, ncol=ncol(x), nrow=hx1$Mo-1))
??? ??? ??? ?x3 <- as.data.frame(matrix(NA,ncol=ncol(x),nrow=12-tx1$Mo))
??? ??? ??? ?colnames(x2) <- colnames(x)
??? ??? ??? ?colnames(x3) <- colnames(x)
??? ??? ??? ?if(nrow(x) < 540) rbind(x2,x,x3) else x? })
which(sapply(lst10,nrow)!=540)
#integer(0)
lst11 <-lapply(lst10,function(x)
data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE)))
? lst12<- lapply(seq_along(lst10),function(i){
??? x<- lst11[[i]]
??? colnames(x)<- lstf1[i]
??? row.names(x)<- 1:nrow(x)
??? x
? })
res2 <-? do.call(cbind,lst11)
?dim(res2)
#[1] 16740??? 98
?
res2[res2==-9999.9]<-NA # change missing value identifier as in your data set
which(res2==-9999.9)
#integer(0)
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst12<-lapply(sldat,function(x)
lapply(split(x,gsub(".*-(.*)-.*","\\1",x)),
function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0)
{x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)}
else y} ))
any(sapply(lst12,function(x) any(lapply(x,length)!=31)))
#[1] FALSE
lst22<-lapply(lst12,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res3 <- data.frame(dates=dates3,res2,stringsAsFactors=FALSE)
str(res3)
res3$dates<-as.Date(res3$dates)
res4 <- res3[!is.na(res3$dates),]
res4[1:3,1:3]
dim(res4)
?#[1] 16436??? 99
A.K.
On Friday, November 8, 2013 5:54 PM, Zilefac Elvis <zilefacelvis at
yahoo.com> wrote:
Hi Ak,
I think I figured out how to do the sub-setting. All I needed was to use column
3 in Temperature_inventory and select matching .txt files in the .zip file. The
final result would be a subset of files whose IDs are in column 3 of
temp_inventory.
*************************************************************************
I also have this script which you developed for managing precipitation files.
Now I want to use the same code for the temperature files I sent to you. I tried
doing it with some errors.
Please try these scripts on my temperature data. If you need further information
let me know.
Note here that -9999.99M is -9999.9M in the temperature files.
library(stringr)# load it
res<-lapply(temp,function(x) {con <- file(x);
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?Lines1<- readLines(con);
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?close(con);
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?Lines2<-Lines1[-1];# myfiles contain headers
in row 2, so I removed the headers
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?Lines3<-
str_split(Lines2,"-9999.99M");
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?Lines4<-
str_trim(unlist(lapply(Lines3,function(x){x[x==""]<-NA;#replace
missing identifier with NA
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
? ? paste(x,collapse=" ")})));
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?Lines5<-
gsub("(\\d+)[A-Za-z]","\\1",Lines4);
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?res<-
read.table(text=Lines5,sep="",header=FALSE,fill=TRUE)})
lapply(res,head,2)# take a look at first two rows of res.
lapply(seq_along(res),function(i)
write.table(res[[i]],paste0(gsub(".txt","",temp[i]),".txt"),row.names=FALSE,quote=FALSE))
#********************************************************************************************************
# Then use the following as a continuation from the one above
lstf1<- list.files(pattern=".txt")
length(lstf1)
fun2<- function(lstf){
? lst1<-lapply(lstf,function(x) readLines(x))
? lst2<-lapply(lst1,function(x) {gsub("(\\d+)(-9999.99)","\\1
\\2",x)})#change missing value identifier as in your data set
? lst3<-lapply(lst2,function(x)
{x<-gsub("(\\d+)(-9999.99)","\\1 \\2",x)})#change missing
value identifier as in your data set
? lst4<- lapply(lst3,function(x)
read.table(text=x,header=TRUE,stringsAsFactors=FALSE,sep="",fill=TRUE))
? lst5<- lapply(lst4,function(x) x[x$V1>=1961 & x$V1<=2005,])
? lst6<- lapply(lst5,function(x) x[!is.na(x$V1),])
? lst7<- lapply(lst6,function(x) {
? ? if((min(x$V1)>1961)|(max(x$V1)<2005)){
? ? ? n1<- (min(x$V1)-1961)*12
? ? ? x1<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n1))
? ? ? n2<- (2005-max(x$V1))*12
? ? ? x2<- as.data.frame(matrix(NA,ncol=ncol(x),nrow=n2))
? ? ? x3<- rbind(x1,x,x2)
? ? }
? ? else {
? ? ? x
? ? } })
? lst8<-lapply(lst7,function(x)
data.frame(col1=unlist(data.frame(t(x)[-c(1:2),]),use.names=FALSE))) ####changed
? lst9<- lapply(seq_along(lst8),function(i){
? ? x<- lst8[[i]]
? ? colnames(x)<- lstf1[i]
? ? row.names(x)<- 1:nrow(x)
? ? x
? })
? do.call(cbind,lst9)}
res<-fun2(lstf1)
dim(res)
res[res==-9999.99]<-NA # change missing value identifier as in your data set
which(res==-9999.99)#change missing value identifier as in your data set
dates1<-seq.Date(as.Date('1Jan1961',format="%d%b%Y"),as.Date('31Dec2005',format="%d%b%Y"),by="day")
dates2<- as.character(dates1)
sldat<- split(dates2,list(gsub("-.*","",dates2)))
lst11<-lapply(sldat,function(x)
lapply(split(x,gsub(".*-(.*)-.*","\\1",x)),
function(y){x1<-as.numeric(gsub(".*-.*-(.*)","\\1",y));if((31-max(x1))>0)
{x2<-seq(max(x1)+1,31,1);x3<-paste0(unique(gsub("(.*-.*-).*","\\1",y)),x2);c(y,x3)}
else y} ))
any(sapply(lst1,function(x) any(lapply(x,length)!=31)))
lst22<-lapply(lst11,function(x) unlist(x,use.names=FALSE))
sapply(lst22,length)
dates3<-unlist(lst22,use.names=FALSE)
length(dates3)
res1<- data.frame(dates=dates3,res,stringsAsFactors=FALSE)
str(res1)
res1$dates<-as.Date(res1$dates)
res2<-res1[!is.na(res1$dates),]
res2[1:3,1:3]
dim(res2)
write.csv(res2, file = "TemperatureAllstations.csv")#
#***********************************************************************************
Waiting for your useful input.
Thanks so much,
Atem.
On Friday, November 8, 2013 2:18 PM, arun <sm
you wanted to do.? If you want to transfer the subset of files from the main
folder to a new location, then you may try: (make sure you create a copy of the
original .txt folder before doing this)
I created three sub folders and two files (BTemperature_Stations.txt and
Tempearture inventory.csv) in my working directory.
list.files()
#[1] "BTemperature_Stations.txt" "Files1"????????? ## Files1
folder contains all the .txt files; #SubsetFiles: created to subset the files
that match the condition????????????????
#[3]
"FilesCopy"???????????????? "SubsetFiles1"?????????
#FilesCopy. A copy of the Files1 folder??
#[5] "Tempearture inventory.csv"
list.files(pattern="\\.")
#[1] "BTemperature_Stations.txt" "Tempearture inventory.csv"
fl1 <- list.files(pattern="\\.")
?dat1 <-
read.table(fl1[1],header=TRUE,sep="",stringsAsFactors=FALSE,fill=TRUE,check.names=FALSE)
?dat2 <-
read.csv(fl1[2],header=TRUE,sep=",",stringsAsFactors=FALSE,check.names=FALSE)
vec1 <- dat1[,3][dat1[,3]%in% dat2[,3]]
vec2 <-
list.files(path="/home/arunksa111/Zl/Files1",recursive=TRUE)
?sum(gsub(".txt","",vec2) %in% vec1)
#[1] 98
vec3 <-? vec2[gsub(".txt","",vec2) %in% vec1]
lapply(vec3, function(x)
file.rename(paste("/home/arunksa111/Zl/Files1",x,sep="/"),
paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/")))
#change the path accordingly.
length(list.files(path="/home/arunksa111/Zl/SubsetFiles1"))
#[1] 98
fileDim <- sapply(vec3,function(x) {x1
<-read.delim(paste("/home/arunksa111/Zl/SubsetFiles1",x,sep="/"),header=TRUE,stringsAsFactors=FALSE,sep=",",check.names=FALSE);
dim(x1)})
fileDim[,1:3]
#???? dn3011120.txt dn3011240.txt dn3011887.txt
#[1,]????????? 1151?????????? 791????????? 1054
#[2,]???????????? 7???????????? 7???????????? 7
A.K.
On Friday, November 8, 2013 1:41 PM, Zilefac Elvis <
les from a list of files. All are text files. The index for selection is found
in column 3 of both files.
Attached are my data files.
Btemperature_Stations is my
main file.
Temperature inventory is my 'wanted' file and is a subset of
Btemperature_Stations.
Using column 3 in both files, select the files in?Temperature inventory
from?Btemperature_Stations.
The .zip file contains the .txt files which you will extract to a folder and do
the selection in R.
Thanks,
Atem.