Paul Bernal
2023-Sep-30 16:45 UTC
[R] Trouble with FAILDATE when using the aggregate() and ts() functions
Dear friends, First of all, the following is the R version I am working with: [64-bit] C:\Program Files\R\R-4.3.1 The packages I am using are the following: library("dplyr") library("lubridate") library("tidyverse") library("readxl") library("stats") #reading data for distribution fitting failuredf <- as.data.frame(read_excel("FailureData.xlsx")) failuredf2 <- subset(failuredf, !is.na(FAILDATE)) failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){ failuredf2$Fails = 1 }else{ failuredf2$Fails = 0 } #reading data for distribution fitting failuredf <- as.data.frame(read_excel("FailureData.xlsx")) failuredf2 <- subset(failuredf, !is.na(FAILDATE)) dput(head(failuredf2)) structure(list(WONUM = c("946936", "944757", "946194", "897595", "897714", "898221"), ASSET = c("REMLOSSANTOS", "REMCAMPANA", "REMBAYANO", "REMCHIRIQUI", "REMDARIEN", "REMTUIRA"), INSTALLDATE structure(c(1039910400, NA, 1223424000, 1200355200, 1175385600, 1220918400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), LOCATION = c("DISTRITO-NOR", "DISTRITO-NOR", "DISTRITO-NOR", "DISTRITO-SUR", "DISTRITO-NOR", "DISTRITO-NOR" ), DESCRIPTION = c("APOYO DE ATF- LIMPIEZA DE TANQUES DE COMBUSTIBLE REMOLCADOR LOS SANTOS", "APOYO DE ATF- LIMPIEZA DE LOS TANQUES DE COMBUSTIBLE 4B Y 4C (PORT Y STB DAY TANKS) CERRO CAMPANA", "APOYO DE ATF- LIMPIEZA Y PINTURA CUARTO DE M?QUINAS REMOLCADOR BAYANO", "Para FDS - Unidad de propulsi?n de babor con movimientos erraticos", "Para FDS - Bunkering", "Para FDS - Problemas intermitentes en el sistema de embrague de la unidad MCD en el SME" ), WORKODER_REPORTDATE = structure(c(1517910018, 1517211639, 1517563662, 1510655175, 1510733776, 1510909387), class = c("POSIXct", "POSIXt"), tzone = "UTC"), WORKTYPE = c("MC", "MC", "MC", "MC", "MC", "MC"), COMPONENTE = c("FSE", "FSE", "FSE", "MPS", "MME", "CSE"), COMP_DESCRIPTION = c("fuel system", "fuel system", "fuel system", "miscellaneous propulsion & steering", "miscellaneous main engine", "control system"), FAILURECODE = c("REM", "REM", "REM", "REM", "REM", "REM"), FAILDATE = structure(c(1520438053, 1520438271, 1520438369, 1492283280, 1497858240, 1476959280), class = c("POSIXct", "POSIXt"), tzone = "UTC"), ORGID = c("ACP", "ACP", "ACP", "ACP", "ACP", "ACP"), TICKETCLASS = c(NA, NA, NA, NA, NA, NA), PROBLEMA c("MENG", "MENG", "MENG", "PROSY", "MENG", "MENG"), PROBLEMA_CODE_DESC = c("MAIN ENGINES", "MAIN ENGINES", "MAIN ENGINES", "Propulsion system", "MAIN ENGINES", "MAIN ENGINES"), CAUSA = c(NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_), CAUSA_CODE_DESC c(NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_ ), REMEDIO = c(NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_), REMEDIO_CODE_DESC = c(NA_character_, NA_character_, NA_character_, NA_character_, NA_character_, NA_character_ )), row.names = c(NA, 6L), class = "data.frame") I basically want to group the dataset by FAILDATE and then just count the number of failures. It would be equivalent to creating a pivot table where you specify FAILDATE as rows and then count any field (in this case WONUM). #Here I format FAILDATE as YYYY-MM failuredf2["faildateproc"] = format(failuredf2$FAILDATE, format='%Y-%m') #I just select the fields I am interested in failuredf3 <- select(failuredf2, faildateproc, Fails) head(failuredf3) #I do the aggregation here failuredf3.sum <- aggregate(x = failuredf3[c("Fails")], FUN = sum, by = list(FailDate = failuredf3$faildateproc)) This actually works, but the problem comes when I try to extract the year and month from FailDate when I apply function ts() (to transform the data to a ts object (it is worth mentioning that the aggregate() function doesn't work if I do as.Date(failuredf$FAILDATE, format='%Y-%m'). By the way, it is worth mentioning that this is the aggregate function from the stats package. Look at the output when I do not apply the as.Date() funciton to failuredf$FAILDATE: head(failuredf3.sum) FailDate Fails 1 2016-09 1 2 2016-10 1 3 2016-11 3 4 2016-12 4 5 2017-01 5 6 2017-02 6 this is doing the aggregation properly, but if I apply the as.Date() function first, look what happens when I try to aggregate: #reading data for distribution fitting failuredf <- as.data.frame(read_excel("FailureData.xlsx")) failuredf2 <- subset(failuredf, !is.na(FAILDATE)) dput(head(failuredf2)) failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){ failuredf2$Fails = 1 }else{ failuredf2$Fails = 0 } failuredf2["faildateproc"] = as.Date(failuredf2$FAILDATE, format='%Y-%m')) failuredf3 <- select(failuredf2, faildateproc, Fails) head(failuredf3) failuredf3.sum <- aggregate(x = failuredf3[c("Fails")], FUN = sum, by = list(FailDate = failuredf3$faildateproc)) head(failuredf3.sum)> head(failuredf3.sum)FailDate Fails 1 2016-09-27 1 2 2016-10-20 1 3 2016-11-09 1 4 2016-11-18 1 5 2016-11-22 1 6 2016-12-01 1 (this is not aggregating at all) It seems that the aggregate() function doesn't work with actual dates. I want to do the following: failure_ts <- ts(failuredf3.sum$Fails, start=c(year(failuredf3.sum[1,1]), month(failuredf3.sum[1,1)), end=c(year(failuredf3.sum[nrow(failuredf3.sum),1],month(failuredf3.sum[nrow(failuredf3),1])), frequency=12) I basically want to arrange my data so that I can have the count of failures grouped by YYYY-MM period and then be able to extract the year and month automatically when using the ts object, so I do not have to do it manually. Any help and/or guidance will be greatly appreciated. Below the complete code with the issue I am experiencing: library("vcd") library("stats") library("fitdistrplus") library("actuar") library("betafunctions") library("extraDistr") library("gamlss.dist") library("mbbefd") library("VGAM") library("LaplacesDemon") library("GB2") library("BMT") library("bridgedist") library("ExtDist") library("Pareto") library("CircStats") library("circular") library("lmomco") library("sgt") library("ChernoffDist") library("Runuran") library("emdbook") library("extraDistr") library("sadists") library("rmutil") library("cbinom") library("Davies") library("nCDunnett") library("AEP") library("tolerance") library("mixSPE") library("poweRlaw") library("ReIns") library("normalp") library("SMR") library("fpow") library("flexsurv") library("RTDE") library("distributionsrd") library("evd") library("SuppDists") library("EnvStats") library("ghyp") library("CaDENCE") library("VarianceGamma") library("MCMCpack") library("coga") library("ggamma") library("distTails") library("sglg") library("ollggamma") library("truncnorm") library("greybox") library("csn") library("CompQuadForm") library("NormalLaplace") library("evir") library("FAdist") library("TLMoments") library("QRM") library("ROOPSD") library("fExtremes") library("revdbayes") library("dgumbel") library("bgumbel") library("ugomquantreg") library("smoothmest") library("marg") library("fBasics") library("GeneralizedHyperbolic") library("HyperbolicDist") library("SkewHyperbolic") library("FatTailsR") library("statmod") library("SuppDists") library("GIGrvg") library("ForestFit") library("elfDistr") library("ecd") library("gld") library("ald") library("distr") library("gb") library("LambertW") library("L1pack") library("gambin") library("sld") library("SCI") library("glogis") library("GTDL") library("logitnorm") library("loglognorm") library("minimax") library("MittagLeffleR") library("extremefit") library("qrmtools") library("Renext") library("ParetoPosStable") library("PhaseTypeR") library("mapfit") library("matrixdist") library("stabledist") library("stable") library("PearsonDS") library("rtdists") library("MixedTS") library("crch") library("skewt") library("FMStable") library("SymTS") library("sn") library("triangle") library("mc2d") library("trapezoid") library("tsallisqexp") library("tweedie") library("KScorrect") library("vasicek") library("vasicekreg") library("rvMF") library("WienR") library("fBasics") library("gamlss") library("gamlss.add") library("univariateML") library("readxl") library("xlsx") library("MASS") library("invgamma") library("dplyr") library("lubridate") library("tidyverse") library("forecast") library("glmnet") library("neuralnet") library("nnfor") library("smooth") library("tseries") library("greybox") library("caret") library("lattice") library("doParallel") library("pdp") library("readxl") library("mixtools") library("pivottabler") library("arsenal") library("kableExtra") library("janitor") library("forecastHybrid") #getting current working directory getwd() #setting up desired working directory setwd("C:/Users/PaulBernal") #reading data for distribution fitting failuredf <- as.data.frame(read_excel("FailureData.xlsx")) failuredf2 <- subset(failuredf, !is.na(FAILDATE)) failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){ failuredf2$Fails = 1 }else{ failuredf2$Fails = 0 } #mapply(format, failuredf2$FAILDATE, tz = "America/Bogota") failuredf2["faildateproc"] = format(failuredf2$FAILDATE, format='%Y-%m') failuredf3 <- select(failuredf2, faildateproc, Fails) head(failuredf3) failuredf3.sum <- aggregate(x = failuredf3[c("Fails")], FUN = sum, by = list(FailDate = failuredf3$faildateproc)) head(failuredf3.sum) failuredf3.sum %>% arrange(ym(failuredf3.sum$FailDate)) failuredf3.sum fail_ts_t <- ts(failuredf3.sum$Fails, start=c(year(failuredf3.sum[1,1]), month(failuredf3.sum[1,1])), end = c(2023,9), frequency = 12) Error in as.POSIXlt.character(x, tz = tz(x)) : character string is not in a standard unambiguous format Kind regards, Paul [[alternative HTML version deleted]]