On Oct 9, 2015, at 4:21 PM, Boris Steipe wrote:> I think you are going into the wrong direction here and this is a classical example of what we mean by "technical debt" of code. Rather than tell to your regular expression what you are looking for, you are handling special cases with redundant code. This is ugly, brittle and impossible to maintain. > > Respect to you that you have recognized this. > > > The solution is rather simple: > > A) Isolate tokens. Your IDs contain only a limited set of characters. Split your strings along the characters that are not found in IDs to isolate candidate tokens, place them into a vector. > > B) Evaluate your tokens: as far as I can see IDs all contain letters AND numbers. This is a unique characteristic. Thus it is sufficient to grep for a letter/number pair in a token to identify it as an ID. > > Should you ever find a need to accommodate differently formed IDs, there are only two, well defined places with clearly delegated roles where changes might be needed. > > Here is the code: > > for (i in 1:nrow(ripley.tv)) { > v <- unlist(strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+")) # isolate tokens > ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] # identify IDs and store > }That logic actually simplifies the regex strategy as well: sub("(.*[ \n])([-A-Z0-9]{6,12})(.*)", "\\2", ripley.tv$producto, ignore.case = T) Almost succeeds, with a few all-character words, but if you require one number in the middle you get full results: sub("(.*[ \n])([-A-Z0-9]{3,6}[0-9][-A-Z0-9]{2,6})(.*)", "\\2", ripley.tv$producto, ignore.case = T) [1] "48J6400" "40J5300" "TC-40CS600L" "LE28F6600" "LE40K5000N" [6] "LE32B7000" "LE32K5000N" "LE55B8000" "LE40B8000" "LE24B8000" [11] "TC-42AS610" "LE50K5000N" "40JU6500" "48JU6500" "50JU6500" [16] "55JS9000" "55JU6500" "55JU6700" "55JU7500" "65JS9000" [21] "65JU6500" "65JU7500" "75JU6500" "40LF6350" "42LF6400" [26] "42LF6450" "49LF6450" "LF6400" "43UF6750" "49UF6750" [31] "UF6900" "49UF7700" "49UF8500" "55UF7700" "65UF7700" [36] "55UF8500" "TC-55CX640W" "TC-50CX640W" "70UF7700" "UG8700" [41] "LF6350" "KDL-50FA95C" "KDL50W805C" "KDL-40R354B" "40J5500" [46] "50J5500" "32JH4005" "50J5300" "48J5300" "40J6400" [51] "KDL-32R505C" "KDL-40R555C" "55J6400" "40JH5005" "43LF5410" [56] "32LF585B" "49LF5900" "KDL-65W855C" "UN48J6500" "LE40F1551" [61] "TC-32AS600L" "KDL-32R304B" "55EC9300" "LE32W454F" "58UF8300" [66] "KDL-55W805C" "XBR-49X835C" "XBR-55X855C" "XBR-65X905C" "XBR-75X945C" [71] "XBR-55X905C" "LC60UE30U" "LC70UE30U" "LC80UE30U" "48J5500" [76] "79UG8800" "65UF9500" "65UF8500" "55UF9500" "32J4300" [81] "KDL-48R555C" "55UG8700" "60UF8500" "55LF6500" "32LF550B" [86] "47LB5610" "TC-50AS600L" "XBR-55X855B" "LC70SQ17U" "XBR-79X905B" [91] "TC-40A400L" "XBR-70X855B" "55HU8700" "LE40D3142" "TC-42AS650L" [96] "LC70LE660" "LE58D3140"> > > > Cheers, > Boris > > > > On Oct 9, 2015, at 5:48 PM, Omar Andr? Gonz?les D?az <oma.gonzales at gmail.com> wrote: > >>>>> ripley.tv <- structure(list(id = c(NA, NA, NA, NA, NA, NA, NA, NA, >>> NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, >>>>> NA, NA, NA, NA, NA, NA, NA), marca = c("SAMSUNG", "SAMSUNG", >>>>> "PANASONIC", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", >>>>> "HAIER", "PANASONIC", "HAIER", "SAMSUNG", "SAMSUNG", "SAMSUNG", >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", >>>>> "SAMSUNG", "SAMSUNG", "LG", "LG", "LG", "LG", "LG", "LG", "LG", >>>>> "LG", "LG", "LG", "LG", "LG", "LG", "PANASONIC", "PANASONIC", >>>>> "LG", "LG", "LG", "SONY", "SONY", "SONY", "SAMSUNG", "SAMSUNG", >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SONY", "SONY", "SAMSUNG", >>>>> "SAMSUNG", "LG", "LG", "LG", "SONY", "SAMSUNG", "AOC", "PANASONIC", >>>>> "SONY", "LG", "AOC", "LG", "SONY", "SONY", "SONY", "SONY", "SONY", >>>>> "SONY", "SHARP", "SHARP", "SHARP", "SAMSUNG", "LG", "LG", "LG", >>>>> "LG", "SAMSUNG", "SONY", "LG", "LG", "LG", "LG", "LG", "PANASONIC", >>>>> "SONY", "SHARP", "SONY", "PANASONIC", "SONY", "SAMSUNG", "AOC", >>>>> "PANASONIC", "SHARP", "AOC"), producto = c("SMART TV LED FHD 48\" 3D >>>>> 48J6400", >>>>> "SMART TV LED FHD 40\" 40J5300", "TV LED FULL HD 40'' TC-40CS600L", >>>>> "TELEVISOR LED LE28F6600 28\"", "SMART TV 40\" HD LE40K5000N", >>>>> "TV LED HD 32'' LE32B7000", "SMART TV 32'' LE32K5000N", "TV LED FHD >>> 55\" - >>>>> LE55B8000", >>>>> "TV LED LE40B8000 FULL HD 40\"", "TV LE24B8000 LED HD 24\" - NEGRO", >>>>> "TV LED FULL HD 42'' TC-42AS610", "TELEVISOR LED LE50K5000N 50\"", >>>>> "SMART TV LED UHD 40\" 40JU6500", "SMART TV ULTRA HD 48'' 48JU6500", >>>>> "SMART TV 50JU6500 LED UHD 50\" - NEGRO", "SMART TV ULTRA HD 55'' 3D >>>>> 55JS9000", >>>>> "SMART TV LED UHD 55\" 55JU6500", "SMART TV ULTRA HD 55'' 55JU6700", >>>>> "SMART TV CURVO 55JU7500 LED UHD 55\" 3D - NEGRO", "SMART TV ULTRA HD >>> 65'' >>>>> 3D 65JS9000", >>>>> "SMART TV 65JU6500 LED UHD 65\"", "SMART TV ULTRA HD 65'' 65JU7500", >>>>> "SMART TV LED UHD 75\" 75JU6500", "SMART TV WEB OS 40\" FULL HD >>> 40LF6350", >>>>> "SMART TV 3D 42\" FULL HD 42LF6400", "TV LED 42\" FULL HD CINEMA 3D >>>>> 42LF6450", >>>>> "TV LED 49\" FULL HD CINEMA 3D 49LF6450", "SMART TV LF6400 49\" FULL HD >>>>> 3D", >>>>> "TV 43UF6750 43\" ULTRA HD 4K", "TV 49\" ULTRA HD 4K 49UF6750", >>>>> "TV LED 49\" ULTRA HD SMART UF6900", "SMART TV 49UF7700 49\" ULTRA HD >>> 4K", >>>>> "SMART TV 49UF8500 49\" ULTRA HD 4K 3D", "TV LED 55\" CINEMA 3D SMART >>> TV >>>>> 55UF7700", >>>>> "SMART TV 65UF7700 65\" ULTRA HD 4K", "SMART TV 55UF8500 55\" ULTRA HD >>> 4K >>>>> 3D", >>>>> "TV LED 55\" ULTRA HD 4K SMART TC-55CX640W", "TV LED 50\" ULTRA HD 4K >>> SMART >>>>> TC-50CX640W", >>>>> "SMART TV 70UF7700 3D ULTRA HD 70\"", "TV LED CURVO 65\" ULTRA HD 4K >>> CINEMA >>>>> SMART UG8700", >>>>> "TV LED 60\" FULL HD SMART LF6350", "SMART TV KDL-50FA95C 50\" FULL HD >>> 3D", >>>>> "SMART TV KDL50W805C 50\" FULL HD 3D", "TV LED 40\" FULL HD >>> KDL-40R354B", >>>>> "SMART TV LED FULL HD 40'' 40J5500", "SMART TV LED FULL HD 50'' >>> 50J5500", >>>>> "TV LED HD 32'' 32JH4005", "SMART TV LED FULL HD 50\" 50J5300", >>>>> "SMART TV LED 48\" FULL HD 48J5300", "SMART TV FULL HD 40'' 3D >>> 40J6400", >>>>> "TV LED 32\" HD SMART KDL-32R505C", "TV LED 40\" SMART FULL HD >>> KDL-40R555C >>>>> - NEGRO", >>>>> "SMART TV LED FHD 55\" 3D 55J6400", "TV 40JH5005 LED FHD 40\" - NEGRO", >>>>> "TV 43\" FULL HD 43LF5410", "SMART TV 32LF585B LED HD 32\" - BLANCO", >>>>> "TV LED 49\" FULL HD SMART 49LF5900", "SMART TV 65\" FULL HD 3D >>>>> KDL-65W855C", >>>>> "SMART TV LED FHD 48\" UN48J6500", "TV LED 40\" FULL HD LE40F1551", >>>>> "TV LED 32'' SMART HD TC-32AS600L", "TV LED 32'' HD KDL-32R304B", >>>>> "TV OLED 55\" SMART 3D FULL HD 55EC9300 PLATEADO", "TV LED HD 32'' >>>>> LE32W454F", >>>>> "TV LED 58\" ULTRA HD SMART 58UF8300", "TV LED 55\" FULL HD SMART 3D >>>>> KDL-55W805C", >>>>> "TV LED 49\" ULTRA HD 4K XBR-49X835C", "TV LED 55\" ULTRA HD 4K >>>>> XBR-55X855C", >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-65X905C", "TV LED 75\" >>> ULTRA HD >>>>> 4K 3D XBR-75X945C", >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-55X905C", "SMART TV LED 60'' >>>>> ULTRA HD 4K LC60UE30U", >>>>> "SMART TV LED 70'' ULTRA HD 4K LC70UE30U", "SMART TV LED 80'' ULTRA HD >>> 4K >>>>> LC80UE30U", >>>>> "SMART TV LED FULL HD 48'' 48J5500", "SMART TV CURVO 79UG8800 79\" >>> ULTRA HD >>>>> 4K 3D", >>>>> "SMART TV 65UF9500 65\" ULTRA HD 4K 3D", "SMART TV 65UF8500 65\" ULTRA >>> HD >>>>> 4K 3D", >>>>> "SMART TV 55UF9500 55\" ULTRA HD 4K 3D", "SMART TV LED HD 32\" >>> 32J4300", >>>>> "TV LED 48\" SMART FULL HD KDL-48R555C - NEGRO", "SMART TV 55UG8700 >>> 55\" >>>>> ULTRA HD 4K 3D", >>>>> "SMART TV 60UF8500 60\" ULTRA HD 4K 3D", "SMART TV 55LF6500 55\" FULL >>> HD >>>>> 3D", >>>>> "TV 32LF550B 32\" HD", "TV LED 47\" FULL HD 47LB5610", "TV LED FULL HD >>> 50'' >>>>> TC-50AS600L", >>>>> "TV SMART LED 55\" UHD 3D XBR-55X855B", "TV LED FULL HD 4K LC70SQ17U >>> 70''", >>>>> "TV LED SMART UHD 79\" XBR-79X905B", "TV LED FULL HD 40'' TC-40A400L", >>>>> "TV LED SMART UHD 70\" XBR-70X855B", "SMART TV UHD 55'' 3D CURVO >>> 55HU8700", >>>>> "TV FULL HD LE40D3142 40\" - NEGRO", "TELEVISOR LED 42\" TC-42AS650L", >>>>> "SMART TV LCD FHD 70\" LC70LE660", "TV LED FULL HD 58'' LE58D3140" >>>>> ), pulgadas = c(48L, 40L, 40L, 28L, 40L, 32L, 32L, 55L, 40L, >>>>> 24L, 42L, 50L, 40L, 48L, 50L, 55L, 55L, 55L, 55L, 65L, 65L, 65L, >>>>> 75L, 40L, 42L, 42L, 49L, 49L, 43L, 49L, 49L, 49L, 49L, 55L, 65L, >>>>> 55L, 55L, 50L, 70L, 65L, 60L, 50L, 50L, 40L, 40L, 50L, 32L, 50L, >>>>> 48L, 40L, 32L, 40L, 55L, 40L, 43L, 32L, 49L, 65L, 48L, 40L, 32L, >>>>> 32L, 55L, 32L, 58L, 55L, 49L, 55L, 55L, 75L, 55L, 60L, 70L, 80L, >>>>> 48L, 79L, 65L, 65L, 55L, 32L, 48L, 55L, 60L, 55L, 32L, 47L, 50L, >>>>> 55L, 70L, 79L, 40L, 70L, 55L, 40L, 42L, 70L, 58L), precio.antes >>> c(2799L, >>>>> 1799L, 1699L, 599L, 1299L, 699L, 999L, 1999L, 999L, 499L, 1899L, >>>>> 1799L, 2499L, 3999L, 3699L, 10999L, 4299L, 5499L, 6999L, 14999L, >>>>> 8999L, 9999L, 14599L, 1999L, 2299L, 2299L, 2899L, 2999L, 2299L, >>>>> 23992L, 3599L, 3799L, 4799L, 4999L, 8499L, 5999L, 4999L, 3999L, >>>>> 11999L, 10999L, 4399L, 4499L, 3799L, 1399L, 2299L, 2799L, 999L, >>>>> 2199L, 2299L, 2299L, 1299L, 1699L, 3499L, 1399L, 1549L, 1299L, >>>>> 2399L, 6499L, 2999L, 999L, 1249L, 999L, 14999L, 799L, 5999L, >>>>> 4499L, 4999L, 6499L, 12999L, 24999L, 8999L, 5999L, 7599L, 14999L, >>>>> 2499L, 29999L, 13999L, 9999L, 9699L, 1299L, 2399L, 6999L, 7999L, >>>>> 3699L, 999L, 1899L, 2999L, 7999L, 8499L, 24999L, 1399L, 13999L, >>>>> 8499L, 999L, 2599L, 5799L, 2399L), precio.nuevo = c(2299, 1399, >>>>> 1299, 549, 1099, 629, 799, 1699, 849, 439, 1499, 1549, 1759.2, >>>>> 2099.3, 2309.3, 7699.3, 2799.3, 3639.3, 4899.3, 10499.3, 5109.3, >>>>> 6999.3, 10219.3, 1399, 1599, 1599, 2199, 2199, 1299, 23992, 2299, >>>>> 2299, 2899, 2999, 5999, 3899, 4999, 3999, 8999, 6999, 4099, 3999, >>>>> 3499, 1299, 1799, 2399, 799, 2199, 1799, 1999, 1199, 1599, 2999, >>>>> 1199, 1399, 1099, 1999, 5999, 2799, 999, 1199, 949, 7999, 799, >>>>> 5299, 4299, 3999, 5999, 11999, 23999, 7999, 5699, 7599, 14499, >>>>> 2399, 29999, 11999, 8999, 7499, 1099, 2199, 6599, 7099, 3599, >>>>> 899, 1599, 2199, 4999, 6499, 19999, 1399, 9999, 5999, 999, 2599, >>>>> 5699, 2399), dif.precios = c(500, 400, 400, 50, 200, 70, 200, >>>>> 300, 150, 60, 400, 250, 739.8, 1899.7, 1389.7, 3299.7, 1499.7, >>>>> 1859.7, 2099.7, 4499.7, 3889.7, 2999.7, 4379.7, 600, 700, 700, >>>>> 700, 800, 1000, 0, 1300, 1500, 1900, 2000, 2500, 2100, 0, 0, >>>>> 3000, 4000, 300, 500, 300, 100, 500, 400, 200, 0, 500, 300, 100, >>>>> 100, 500, 200, 150, 200, 400, 500, 200, 0, 50, 50, 7000, 0, 700, >>>>> 200, 1000, 500, 1000, 1000, 1000, 300, 0, 500, 100, 0, 2000, >>>>> 1000, 2200, 200, 200, 400, 900, 100, 100, 300, 800, 3000, 2000, >>>>> 5000, 0, 4000, 2500, 0, 0, 100, 0), dif.porcentual = c(17.86, >>>>> 22.23, 23.54, 8.35, 15.4, 10.01, 20.02, 15.01, 15.02, 12.02, >>>>> 21.06, 13.9, 29.6, 47.5, 37.57, 30, 34.88, 33.82, 30, 30, 43.22, >>>>> 30, 30, 30.02, 30.45, 30.45, 24.15, 26.68, 43.5, 0, 36.12, 39.48, >>>>> 39.59, 40.01, 29.42, 35.01, 0, 0, 25, 36.37, 6.82, 11.11, 7.9, >>>>> 7.15, 21.75, 14.29, 20.02, 0, 21.75, 13.05, 7.7, 5.89, 14.29, >>>>> 14.3, 9.68, 15.4, 16.67, 7.69, 6.67, 0, 4, 5.01, 46.67, 0, 11.67, >>>>> 4.45, 20, 7.69, 7.69, 4, 11.11, 5, 0, 3.33, 4, 0, 14.29, 10, >>>>> 22.68, 15.4, 8.34, 5.72, 11.25, 2.7, 10.01, 15.8, 26.68, 37.5, >>>>> 23.53, 20, 0, 28.57, 29.42, 0, 0, 1.72, 0), rangos = c("S/.1500 - >>> S/.2500", >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - >>>>> S/.1500", >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", >>>>> "S/.500 - S/.1500", "< S/.500", "S/.500 - S/.1500", "S/.1500 - >>> S/.2500", >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.3500 - S/.4500", "> S/.4,500", >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "S/.500 - >>> S/.1500", >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "> S/.4,500", "S/.1500 - >>> S/.2500", >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.2500 - S/.3500", >>>>> "> S/.4,500", "S/.3500 - S/.4500", "> S/.4,500", "S/.3500 - S/.4500", >>>>> "> S/.4,500", "> S/.4,500", "S/.3500 - S/.4500", "S/.3500 - S/.4500", >>>>> "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.1500 - S/.2500", >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "S/.1500 - S/.2500", >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.500 - S/.1500", >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.500 - S/.1500", >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.500 - >>> S/.1500", >>>>> "S/.500 - S/.1500", "> S/.4,500", "S/.500 - S/.1500", "> S/.4,500", >>>>> "S/.3500 - S/.4500", "S/.3500 - S/.4500", "> S/.4,500", "> S/.4,500", >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", >>>>> "> S/.4,500", "S/.500 - S/.1500", "S/.1500 - S/.2500", "> S/.4,500", >>>>> "> S/.4,500", "S/.3500 - S/.4500", "S/.500 - S/.1500", "S/.1500 - >>> S/.2500", >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", >>>>> "S/.500 - S/.1500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", >>>>> "S/.2500 - S/.3500", "> S/.4,500", "S/.1500 - S/.2500")), .Names >>> c("id", >>>>> "marca", "producto", "pulgadas", "precio.antes", "precio.nuevo", >>>>> "dif.precios", "dif.porcentual", "rangos"), class = "data.frame", >>> row.names >>>>> = c(NA, >>>>> -97L)) > > ______________________________________________ > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.David Winsemius Alameda, CA, USA
David, Boris, so thankfull for your help. Both approaches are very good. I got this solve with David's help. I find very insteresting Bori's for loop. And I need a little help understanding the regex part on it. - The strsplit function: strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+") I understand for this: split every row by a sequence of any number or letter or "-" that appears at leat once (+ operator). 1.- What does mena the "^" symbol? If you remove it, just appeare blanks. 2.- Why is there the necessity of "+" after the closing "]"? 3.- How this: ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] Identifies also the IDs where "-" is present. Here the regex does not have the "-" included. Also, I notice that David used the "-" at the begining of the matching: [-A-Z0-9], without the "^" (stars with) at the beginning. I would appreciate a response from you, gentlemen. Thanks again. 2015-10-09 18:32 GMT-05:00 David Winsemius <dwinsemius at comcast.net>:> > On Oct 9, 2015, at 4:21 PM, Boris Steipe wrote: > > > I think you are going into the wrong direction here and this is a > classical example of what we mean by "technical debt" of code. Rather than > tell to your regular expression what you are looking for, you are handling > special cases with redundant code. This is ugly, brittle and impossible to > maintain. > > > > Respect to you that you have recognized this. > > > > > > The solution is rather simple: > > > > A) Isolate tokens. Your IDs contain only a limited set of characters. > Split your strings along the characters that are not found in IDs to > isolate candidate tokens, place them into a vector. > > > > B) Evaluate your tokens: as far as I can see IDs all contain letters AND > numbers. This is a unique characteristic. Thus it is sufficient to grep for > a letter/number pair in a token to identify it as an ID. > > > > Should you ever find a need to accommodate differently formed IDs, there > are only two, well defined places with clearly delegated roles where > changes might be needed. > > > > Here is the code: > > > > for (i in 1:nrow(ripley.tv)) { > > v <- unlist(strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+")) # > isolate tokens > > ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] # identify IDs and > store > > } > > That logic actually simplifies the regex strategy as well: > > sub("(.*[ \n])([-A-Z0-9]{6,12})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > > Almost succeeds, with a few all-character words, but if you require one > number in the middle you get full results: > > sub("(.*[ \n])([-A-Z0-9]{3,6}[0-9][-A-Z0-9]{2,6})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > [1] "48J6400" "40J5300" "TC-40CS600L" "LE28F6600" "LE40K5000N" > [6] "LE32B7000" "LE32K5000N" "LE55B8000" "LE40B8000" "LE24B8000" > [11] "TC-42AS610" "LE50K5000N" "40JU6500" "48JU6500" "50JU6500" > [16] "55JS9000" "55JU6500" "55JU6700" "55JU7500" "65JS9000" > [21] "65JU6500" "65JU7500" "75JU6500" "40LF6350" "42LF6400" > [26] "42LF6450" "49LF6450" "LF6400" "43UF6750" "49UF6750" > [31] "UF6900" "49UF7700" "49UF8500" "55UF7700" "65UF7700" > [36] "55UF8500" "TC-55CX640W" "TC-50CX640W" "70UF7700" "UG8700" > [41] "LF6350" "KDL-50FA95C" "KDL50W805C" "KDL-40R354B" "40J5500" > [46] "50J5500" "32JH4005" "50J5300" "48J5300" "40J6400" > [51] "KDL-32R505C" "KDL-40R555C" "55J6400" "40JH5005" "43LF5410" > [56] "32LF585B" "49LF5900" "KDL-65W855C" "UN48J6500" "LE40F1551" > [61] "TC-32AS600L" "KDL-32R304B" "55EC9300" "LE32W454F" "58UF8300" > [66] "KDL-55W805C" "XBR-49X835C" "XBR-55X855C" "XBR-65X905C" "XBR-75X945C" > [71] "XBR-55X905C" "LC60UE30U" "LC70UE30U" "LC80UE30U" "48J5500" > [76] "79UG8800" "65UF9500" "65UF8500" "55UF9500" "32J4300" > [81] "KDL-48R555C" "55UG8700" "60UF8500" "55LF6500" "32LF550B" > [86] "47LB5610" "TC-50AS600L" "XBR-55X855B" "LC70SQ17U" "XBR-79X905B" > [91] "TC-40A400L" "XBR-70X855B" "55HU8700" "LE40D3142" "TC-42AS650L" > [96] "LC70LE660" "LE58D3140" > > > > > > > > > Cheers, > > Boris > > > > > > > > On Oct 9, 2015, at 5:48 PM, Omar Andr? Gonz?les D?az < > oma.gonzales at gmail.com> wrote: > > > >>>>> ripley.tv <- structure(list(id = c(NA, NA, NA, NA, NA, NA, NA, NA, > >>> NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA), marca = c("SAMSUNG", "SAMSUNG", > >>>>> "PANASONIC", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", > >>>>> "HAIER", "PANASONIC", "HAIER", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "LG", "LG", "LG", "LG", "LG", "LG", "LG", > >>>>> "LG", "LG", "LG", "LG", "LG", "LG", "PANASONIC", "PANASONIC", > >>>>> "LG", "LG", "LG", "SONY", "SONY", "SONY", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SONY", "SONY", > "SAMSUNG", > >>>>> "SAMSUNG", "LG", "LG", "LG", "SONY", "SAMSUNG", "AOC", "PANASONIC", > >>>>> "SONY", "LG", "AOC", "LG", "SONY", "SONY", "SONY", "SONY", "SONY", > >>>>> "SONY", "SHARP", "SHARP", "SHARP", "SAMSUNG", "LG", "LG", "LG", > >>>>> "LG", "SAMSUNG", "SONY", "LG", "LG", "LG", "LG", "LG", "PANASONIC", > >>>>> "SONY", "SHARP", "SONY", "PANASONIC", "SONY", "SAMSUNG", "AOC", > >>>>> "PANASONIC", "SHARP", "AOC"), producto = c("SMART TV LED FHD 48\" 3D > >>>>> 48J6400", > >>>>> "SMART TV LED FHD 40\" 40J5300", "TV LED FULL HD 40'' TC-40CS600L", > >>>>> "TELEVISOR LED LE28F6600 28\"", "SMART TV 40\" HD LE40K5000N", > >>>>> "TV LED HD 32'' LE32B7000", "SMART TV 32'' LE32K5000N", "TV LED FHD > >>> 55\" - > >>>>> LE55B8000", > >>>>> "TV LED LE40B8000 FULL HD 40\"", "TV LE24B8000 LED HD 24\" - NEGRO", > >>>>> "TV LED FULL HD 42'' TC-42AS610", "TELEVISOR LED LE50K5000N 50\"", > >>>>> "SMART TV LED UHD 40\" 40JU6500", "SMART TV ULTRA HD 48'' 48JU6500", > >>>>> "SMART TV 50JU6500 LED UHD 50\" - NEGRO", "SMART TV ULTRA HD 55'' 3D > >>>>> 55JS9000", > >>>>> "SMART TV LED UHD 55\" 55JU6500", "SMART TV ULTRA HD 55'' 55JU6700", > >>>>> "SMART TV CURVO 55JU7500 LED UHD 55\" 3D - NEGRO", "SMART TV ULTRA HD > >>> 65'' > >>>>> 3D 65JS9000", > >>>>> "SMART TV 65JU6500 LED UHD 65\"", "SMART TV ULTRA HD 65'' 65JU7500", > >>>>> "SMART TV LED UHD 75\" 75JU6500", "SMART TV WEB OS 40\" FULL HD > >>> 40LF6350", > >>>>> "SMART TV 3D 42\" FULL HD 42LF6400", "TV LED 42\" FULL HD CINEMA 3D > >>>>> 42LF6450", > >>>>> "TV LED 49\" FULL HD CINEMA 3D 49LF6450", "SMART TV LF6400 49\" FULL > HD > >>>>> 3D", > >>>>> "TV 43UF6750 43\" ULTRA HD 4K", "TV 49\" ULTRA HD 4K 49UF6750", > >>>>> "TV LED 49\" ULTRA HD SMART UF6900", "SMART TV 49UF7700 49\" ULTRA HD > >>> 4K", > >>>>> "SMART TV 49UF8500 49\" ULTRA HD 4K 3D", "TV LED 55\" CINEMA 3D SMART > >>> TV > >>>>> 55UF7700", > >>>>> "SMART TV 65UF7700 65\" ULTRA HD 4K", "SMART TV 55UF8500 55\" ULTRA > HD > >>> 4K > >>>>> 3D", > >>>>> "TV LED 55\" ULTRA HD 4K SMART TC-55CX640W", "TV LED 50\" ULTRA HD 4K > >>> SMART > >>>>> TC-50CX640W", > >>>>> "SMART TV 70UF7700 3D ULTRA HD 70\"", "TV LED CURVO 65\" ULTRA HD 4K > >>> CINEMA > >>>>> SMART UG8700", > >>>>> "TV LED 60\" FULL HD SMART LF6350", "SMART TV KDL-50FA95C 50\" FULL > HD > >>> 3D", > >>>>> "SMART TV KDL50W805C 50\" FULL HD 3D", "TV LED 40\" FULL HD > >>> KDL-40R354B", > >>>>> "SMART TV LED FULL HD 40'' 40J5500", "SMART TV LED FULL HD 50'' > >>> 50J5500", > >>>>> "TV LED HD 32'' 32JH4005", "SMART TV LED FULL HD 50\" 50J5300", > >>>>> "SMART TV LED 48\" FULL HD 48J5300", "SMART TV FULL HD 40'' 3D > >>> 40J6400", > >>>>> "TV LED 32\" HD SMART KDL-32R505C", "TV LED 40\" SMART FULL HD > >>> KDL-40R555C > >>>>> - NEGRO", > >>>>> "SMART TV LED FHD 55\" 3D 55J6400", "TV 40JH5005 LED FHD 40\" - > NEGRO", > >>>>> "TV 43\" FULL HD 43LF5410", "SMART TV 32LF585B LED HD 32\" - BLANCO", > >>>>> "TV LED 49\" FULL HD SMART 49LF5900", "SMART TV 65\" FULL HD 3D > >>>>> KDL-65W855C", > >>>>> "SMART TV LED FHD 48\" UN48J6500", "TV LED 40\" FULL HD LE40F1551", > >>>>> "TV LED 32'' SMART HD TC-32AS600L", "TV LED 32'' HD KDL-32R304B", > >>>>> "TV OLED 55\" SMART 3D FULL HD 55EC9300 PLATEADO", "TV LED HD 32'' > >>>>> LE32W454F", > >>>>> "TV LED 58\" ULTRA HD SMART 58UF8300", "TV LED 55\" FULL HD SMART 3D > >>>>> KDL-55W805C", > >>>>> "TV LED 49\" ULTRA HD 4K XBR-49X835C", "TV LED 55\" ULTRA HD 4K > >>>>> XBR-55X855C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-65X905C", "TV LED 75\" > >>> ULTRA HD > >>>>> 4K 3D XBR-75X945C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-55X905C", "SMART TV LED > 60'' > >>>>> ULTRA HD 4K LC60UE30U", > >>>>> "SMART TV LED 70'' ULTRA HD 4K LC70UE30U", "SMART TV LED 80'' ULTRA > HD > >>> 4K > >>>>> LC80UE30U", > >>>>> "SMART TV LED FULL HD 48'' 48J5500", "SMART TV CURVO 79UG8800 79\" > >>> ULTRA HD > >>>>> 4K 3D", > >>>>> "SMART TV 65UF9500 65\" ULTRA HD 4K 3D", "SMART TV 65UF8500 65\" > ULTRA > >>> HD > >>>>> 4K 3D", > >>>>> "SMART TV 55UF9500 55\" ULTRA HD 4K 3D", "SMART TV LED HD 32\" > >>> 32J4300", > >>>>> "TV LED 48\" SMART FULL HD KDL-48R555C - NEGRO", "SMART TV 55UG8700 > >>> 55\" > >>>>> ULTRA HD 4K 3D", > >>>>> "SMART TV 60UF8500 60\" ULTRA HD 4K 3D", "SMART TV 55LF6500 55\" FULL > >>> HD > >>>>> 3D", > >>>>> "TV 32LF550B 32\" HD", "TV LED 47\" FULL HD 47LB5610", "TV LED FULL > HD > >>> 50'' > >>>>> TC-50AS600L", > >>>>> "TV SMART LED 55\" UHD 3D XBR-55X855B", "TV LED FULL HD 4K LC70SQ17U > >>> 70''", > >>>>> "TV LED SMART UHD 79\" XBR-79X905B", "TV LED FULL HD 40'' > TC-40A400L", > >>>>> "TV LED SMART UHD 70\" XBR-70X855B", "SMART TV UHD 55'' 3D CURVO > >>> 55HU8700", > >>>>> "TV FULL HD LE40D3142 40\" - NEGRO", "TELEVISOR LED 42\" > TC-42AS650L", > >>>>> "SMART TV LCD FHD 70\" LC70LE660", "TV LED FULL HD 58'' LE58D3140" > >>>>> ), pulgadas = c(48L, 40L, 40L, 28L, 40L, 32L, 32L, 55L, 40L, > >>>>> 24L, 42L, 50L, 40L, 48L, 50L, 55L, 55L, 55L, 55L, 65L, 65L, 65L, > >>>>> 75L, 40L, 42L, 42L, 49L, 49L, 43L, 49L, 49L, 49L, 49L, 55L, 65L, > >>>>> 55L, 55L, 50L, 70L, 65L, 60L, 50L, 50L, 40L, 40L, 50L, 32L, 50L, > >>>>> 48L, 40L, 32L, 40L, 55L, 40L, 43L, 32L, 49L, 65L, 48L, 40L, 32L, > >>>>> 32L, 55L, 32L, 58L, 55L, 49L, 55L, 55L, 75L, 55L, 60L, 70L, 80L, > >>>>> 48L, 79L, 65L, 65L, 55L, 32L, 48L, 55L, 60L, 55L, 32L, 47L, 50L, > >>>>> 55L, 70L, 79L, 40L, 70L, 55L, 40L, 42L, 70L, 58L), precio.antes > >>> c(2799L, > >>>>> 1799L, 1699L, 599L, 1299L, 699L, 999L, 1999L, 999L, 499L, 1899L, > >>>>> 1799L, 2499L, 3999L, 3699L, 10999L, 4299L, 5499L, 6999L, 14999L, > >>>>> 8999L, 9999L, 14599L, 1999L, 2299L, 2299L, 2899L, 2999L, 2299L, > >>>>> 23992L, 3599L, 3799L, 4799L, 4999L, 8499L, 5999L, 4999L, 3999L, > >>>>> 11999L, 10999L, 4399L, 4499L, 3799L, 1399L, 2299L, 2799L, 999L, > >>>>> 2199L, 2299L, 2299L, 1299L, 1699L, 3499L, 1399L, 1549L, 1299L, > >>>>> 2399L, 6499L, 2999L, 999L, 1249L, 999L, 14999L, 799L, 5999L, > >>>>> 4499L, 4999L, 6499L, 12999L, 24999L, 8999L, 5999L, 7599L, 14999L, > >>>>> 2499L, 29999L, 13999L, 9999L, 9699L, 1299L, 2399L, 6999L, 7999L, > >>>>> 3699L, 999L, 1899L, 2999L, 7999L, 8499L, 24999L, 1399L, 13999L, > >>>>> 8499L, 999L, 2599L, 5799L, 2399L), precio.nuevo = c(2299, 1399, > >>>>> 1299, 549, 1099, 629, 799, 1699, 849, 439, 1499, 1549, 1759.2, > >>>>> 2099.3, 2309.3, 7699.3, 2799.3, 3639.3, 4899.3, 10499.3, 5109.3, > >>>>> 6999.3, 10219.3, 1399, 1599, 1599, 2199, 2199, 1299, 23992, 2299, > >>>>> 2299, 2899, 2999, 5999, 3899, 4999, 3999, 8999, 6999, 4099, 3999, > >>>>> 3499, 1299, 1799, 2399, 799, 2199, 1799, 1999, 1199, 1599, 2999, > >>>>> 1199, 1399, 1099, 1999, 5999, 2799, 999, 1199, 949, 7999, 799, > >>>>> 5299, 4299, 3999, 5999, 11999, 23999, 7999, 5699, 7599, 14499, > >>>>> 2399, 29999, 11999, 8999, 7499, 1099, 2199, 6599, 7099, 3599, > >>>>> 899, 1599, 2199, 4999, 6499, 19999, 1399, 9999, 5999, 999, 2599, > >>>>> 5699, 2399), dif.precios = c(500, 400, 400, 50, 200, 70, 200, > >>>>> 300, 150, 60, 400, 250, 739.8, 1899.7, 1389.7, 3299.7, 1499.7, > >>>>> 1859.7, 2099.7, 4499.7, 3889.7, 2999.7, 4379.7, 600, 700, 700, > >>>>> 700, 800, 1000, 0, 1300, 1500, 1900, 2000, 2500, 2100, 0, 0, > >>>>> 3000, 4000, 300, 500, 300, 100, 500, 400, 200, 0, 500, 300, 100, > >>>>> 100, 500, 200, 150, 200, 400, 500, 200, 0, 50, 50, 7000, 0, 700, > >>>>> 200, 1000, 500, 1000, 1000, 1000, 300, 0, 500, 100, 0, 2000, > >>>>> 1000, 2200, 200, 200, 400, 900, 100, 100, 300, 800, 3000, 2000, > >>>>> 5000, 0, 4000, 2500, 0, 0, 100, 0), dif.porcentual = c(17.86, > >>>>> 22.23, 23.54, 8.35, 15.4, 10.01, 20.02, 15.01, 15.02, 12.02, > >>>>> 21.06, 13.9, 29.6, 47.5, 37.57, 30, 34.88, 33.82, 30, 30, 43.22, > >>>>> 30, 30, 30.02, 30.45, 30.45, 24.15, 26.68, 43.5, 0, 36.12, 39.48, > >>>>> 39.59, 40.01, 29.42, 35.01, 0, 0, 25, 36.37, 6.82, 11.11, 7.9, > >>>>> 7.15, 21.75, 14.29, 20.02, 0, 21.75, 13.05, 7.7, 5.89, 14.29, > >>>>> 14.3, 9.68, 15.4, 16.67, 7.69, 6.67, 0, 4, 5.01, 46.67, 0, 11.67, > >>>>> 4.45, 20, 7.69, 7.69, 4, 11.11, 5, 0, 3.33, 4, 0, 14.29, 10, > >>>>> 22.68, 15.4, 8.34, 5.72, 11.25, 2.7, 10.01, 15.8, 26.68, 37.5, > >>>>> 23.53, 20, 0, 28.57, 29.42, 0, 0, 1.72, 0), rangos = c("S/.1500 - > >>> S/.2500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - > >>>>> S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.500 - S/.1500", "< S/.500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.3500 - S/.4500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "S/.500 - > >>> S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "> S/.4,500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.2500 - S/.3500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "> S/.4,500", "S/.3500 - S/.4500", > >>>>> "> S/.4,500", "> S/.4,500", "S/.3500 - S/.4500", "S/.3500 - S/.4500", > >>>>> "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.500 - S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.500 - S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.500 - > >>> S/.1500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "S/.500 - S/.1500", "> S/.4,500", > >>>>> "S/.3500 - S/.4500", "S/.3500 - S/.4500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.500 - S/.1500", "S/.1500 - S/.2500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", > >>>>> "S/.2500 - S/.3500", "> S/.4,500", "S/.1500 - S/.2500")), .Names > >>> c("id", > >>>>> "marca", "producto", "pulgadas", "precio.antes", "precio.nuevo", > >>>>> "dif.precios", "dif.porcentual", "rangos"), class = "data.frame", > >>> row.names > >>>>> = c(NA, > >>>>> -97L)) > > > > ______________________________________________ > > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > > https://stat.ethz.ch/mailman/listinfo/r-help > > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > > and provide commented, minimal, self-contained, reproducible code. > > David Winsemius > Alameda, CA, USA > >[[alternative HTML version deleted]]
On Oct 9, 2015, at 9:38 PM, Omar Andr? Gonz?les D?az wrote:> David, Boris, so thankfull for your help. Both approaches are very good. I got this solve with David's help. > > I find very insteresting Bori's for loop. And I need a little help understanding the regex part on it. > > - The strsplit function: strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+") > > I understand for this: split every row by a sequence of any number or letter or "-" that appears at leat once (+ operator). > > 1.- What does mena the "^" symbol? If you remove it, just appeare blanks.Read the section of ?regex on character classes. A leading caret is a negation flag for the rest of the symbols inside the flanking "[]".> 2.- Why is there the necessity of "+" after the closing "]"?Might not be necessary but it means that double or tripled separators get handled as a single split.> > 3.- How this: ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] > Identifies also the IDs where "-" is present. Here the regex does not have the "-" included.<character>-<character> is a range within character classes.> > > Also, I notice that David used the "-" at the begining of the matching: [-A-Z0-9], without the "^" (stars with) at the beginning.Right. If it's the leading symbol, then "-" is interpreted as a literal rather than a range. -- David.> > I would appreciate a response from you, gentlemen. > > Thanks again. > > > > > > > > > > > > 2015-10-09 18:32 GMT-05:00 David Winsemius <dwinsemius at comcast.net>: > > On Oct 9, 2015, at 4:21 PM, Boris Steipe wrote: > > > I think you are going into the wrong direction here and this is a classical example of what we mean by "technical debt" of code. Rather than tell to your regular expression what you are looking for, you are handling special cases with redundant code. This is ugly, brittle and impossible to maintain. > > > > Respect to you that you have recognized this. > > > > > > The solution is rather simple: > > > > A) Isolate tokens. Your IDs contain only a limited set of characters. Split your strings along the characters that are not found in IDs to isolate candidate tokens, place them into a vector. > > > > B) Evaluate your tokens: as far as I can see IDs all contain letters AND numbers. This is a unique characteristic. Thus it is sufficient to grep for a letter/number pair in a token to identify it as an ID. > > > > Should you ever find a need to accommodate differently formed IDs, there are only two, well defined places with clearly delegated roles where changes might be needed. > > > > Here is the code: > > > > for (i in 1:nrow(ripley.tv)) { > > v <- unlist(strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+")) # isolate tokens > > ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] # identify IDs and store > > } > > That logic actually simplifies the regex strategy as well: > > sub("(.*[ \n])([-A-Z0-9]{6,12})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > > Almost succeeds, with a few all-character words, but if you require one number in the middle you get full results: > > sub("(.*[ \n])([-A-Z0-9]{3,6}[0-9][-A-Z0-9]{2,6})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > [1] "48J6400" "40J5300" "TC-40CS600L" "LE28F6600" "LE40K5000N" > [6] "LE32B7000" "LE32K5000N" "LE55B8000" "LE40B8000" "LE24B8000" > [11] "TC-42AS610" "LE50K5000N" "40JU6500" "48JU6500" "50JU6500" > [16] "55JS9000" "55JU6500" "55JU6700" "55JU7500" "65JS9000" > [21] "65JU6500" "65JU7500" "75JU6500" "40LF6350" "42LF6400" > [26] "42LF6450" "49LF6450" "LF6400" "43UF6750" "49UF6750" > [31] "UF6900" "49UF7700" "49UF8500" "55UF7700" "65UF7700" > [36] "55UF8500" "TC-55CX640W" "TC-50CX640W" "70UF7700" "UG8700" > [41] "LF6350" "KDL-50FA95C" "KDL50W805C" "KDL-40R354B" "40J5500" > [46] "50J5500" "32JH4005" "50J5300" "48J5300" "40J6400" > [51] "KDL-32R505C" "KDL-40R555C" "55J6400" "40JH5005" "43LF5410" > [56] "32LF585B" "49LF5900" "KDL-65W855C" "UN48J6500" "LE40F1551" > [61] "TC-32AS600L" "KDL-32R304B" "55EC9300" "LE32W454F" "58UF8300" > [66] "KDL-55W805C" "XBR-49X835C" "XBR-55X855C" "XBR-65X905C" "XBR-75X945C" > [71] "XBR-55X905C" "LC60UE30U" "LC70UE30U" "LC80UE30U" "48J5500" > [76] "79UG8800" "65UF9500" "65UF8500" "55UF9500" "32J4300" > [81] "KDL-48R555C" "55UG8700" "60UF8500" "55LF6500" "32LF550B" > [86] "47LB5610" "TC-50AS600L" "XBR-55X855B" "LC70SQ17U" "XBR-79X905B" > [91] "TC-40A400L" "XBR-70X855B" "55HU8700" "LE40D3142" "TC-42AS650L" > [96] "LC70LE660" "LE58D3140" > > > > > > > > > Cheers, > > Boris > > > > > > > > On Oct 9, 2015, at 5:48 PM, Omar Andr? Gonz?les D?az <oma.gonzales at gmail.com> wrote: > > > >>>>> ripley.tv <- structure(list(id = c(NA, NA, NA, NA, NA, NA, NA, NA, > >>> NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA), marca = c("SAMSUNG", "SAMSUNG", > >>>>> "PANASONIC", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", > >>>>> "HAIER", "PANASONIC", "HAIER", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "LG", "LG", "LG", "LG", "LG", "LG", "LG", > >>>>> "LG", "LG", "LG", "LG", "LG", "LG", "PANASONIC", "PANASONIC", > >>>>> "LG", "LG", "LG", "SONY", "SONY", "SONY", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SONY", "SONY", "SAMSUNG", > >>>>> "SAMSUNG", "LG", "LG", "LG", "SONY", "SAMSUNG", "AOC", "PANASONIC", > >>>>> "SONY", "LG", "AOC", "LG", "SONY", "SONY", "SONY", "SONY", "SONY", > >>>>> "SONY", "SHARP", "SHARP", "SHARP", "SAMSUNG", "LG", "LG", "LG", > >>>>> "LG", "SAMSUNG", "SONY", "LG", "LG", "LG", "LG", "LG", "PANASONIC", > >>>>> "SONY", "SHARP", "SONY", "PANASONIC", "SONY", "SAMSUNG", "AOC", > >>>>> "PANASONIC", "SHARP", "AOC"), producto = c("SMART TV LED FHD 48\" 3D > >>>>> 48J6400", > >>>>> "SMART TV LED FHD 40\" 40J5300", "TV LED FULL HD 40'' TC-40CS600L", > >>>>> "TELEVISOR LED LE28F6600 28\"", "SMART TV 40\" HD LE40K5000N", > >>>>> "TV LED HD 32'' LE32B7000", "SMART TV 32'' LE32K5000N", "TV LED FHD > >>> 55\" - > >>>>> LE55B8000", > >>>>> "TV LED LE40B8000 FULL HD 40\"", "TV LE24B8000 LED HD 24\" - NEGRO", > >>>>> "TV LED FULL HD 42'' TC-42AS610", "TELEVISOR LED LE50K5000N 50\"", > >>>>> "SMART TV LED UHD 40\" 40JU6500", "SMART TV ULTRA HD 48'' 48JU6500", > >>>>> "SMART TV 50JU6500 LED UHD 50\" - NEGRO", "SMART TV ULTRA HD 55'' 3D > >>>>> 55JS9000", > >>>>> "SMART TV LED UHD 55\" 55JU6500", "SMART TV ULTRA HD 55'' 55JU6700", > >>>>> "SMART TV CURVO 55JU7500 LED UHD 55\" 3D - NEGRO", "SMART TV ULTRA HD > >>> 65'' > >>>>> 3D 65JS9000", > >>>>> "SMART TV 65JU6500 LED UHD 65\"", "SMART TV ULTRA HD 65'' 65JU7500", > >>>>> "SMART TV LED UHD 75\" 75JU6500", "SMART TV WEB OS 40\" FULL HD > >>> 40LF6350", > >>>>> "SMART TV 3D 42\" FULL HD 42LF6400", "TV LED 42\" FULL HD CINEMA 3D > >>>>> 42LF6450", > >>>>> "TV LED 49\" FULL HD CINEMA 3D 49LF6450", "SMART TV LF6400 49\" FULL HD > >>>>> 3D", > >>>>> "TV 43UF6750 43\" ULTRA HD 4K", "TV 49\" ULTRA HD 4K 49UF6750", > >>>>> "TV LED 49\" ULTRA HD SMART UF6900", "SMART TV 49UF7700 49\" ULTRA HD > >>> 4K", > >>>>> "SMART TV 49UF8500 49\" ULTRA HD 4K 3D", "TV LED 55\" CINEMA 3D SMART > >>> TV > >>>>> 55UF7700", > >>>>> "SMART TV 65UF7700 65\" ULTRA HD 4K", "SMART TV 55UF8500 55\" ULTRA HD > >>> 4K > >>>>> 3D", > >>>>> "TV LED 55\" ULTRA HD 4K SMART TC-55CX640W", "TV LED 50\" ULTRA HD 4K > >>> SMART > >>>>> TC-50CX640W", > >>>>> "SMART TV 70UF7700 3D ULTRA HD 70\"", "TV LED CURVO 65\" ULTRA HD 4K > >>> CINEMA > >>>>> SMART UG8700", > >>>>> "TV LED 60\" FULL HD SMART LF6350", "SMART TV KDL-50FA95C 50\" FULL HD > >>> 3D", > >>>>> "SMART TV KDL50W805C 50\" FULL HD 3D", "TV LED 40\" FULL HD > >>> KDL-40R354B", > >>>>> "SMART TV LED FULL HD 40'' 40J5500", "SMART TV LED FULL HD 50'' > >>> 50J5500", > >>>>> "TV LED HD 32'' 32JH4005", "SMART TV LED FULL HD 50\" 50J5300", > >>>>> "SMART TV LED 48\" FULL HD 48J5300", "SMART TV FULL HD 40'' 3D > >>> 40J6400", > >>>>> "TV LED 32\" HD SMART KDL-32R505C", "TV LED 40\" SMART FULL HD > >>> KDL-40R555C > >>>>> - NEGRO", > >>>>> "SMART TV LED FHD 55\" 3D 55J6400", "TV 40JH5005 LED FHD 40\" - NEGRO", > >>>>> "TV 43\" FULL HD 43LF5410", "SMART TV 32LF585B LED HD 32\" - BLANCO", > >>>>> "TV LED 49\" FULL HD SMART 49LF5900", "SMART TV 65\" FULL HD 3D > >>>>> KDL-65W855C", > >>>>> "SMART TV LED FHD 48\" UN48J6500", "TV LED 40\" FULL HD LE40F1551", > >>>>> "TV LED 32'' SMART HD TC-32AS600L", "TV LED 32'' HD KDL-32R304B", > >>>>> "TV OLED 55\" SMART 3D FULL HD 55EC9300 PLATEADO", "TV LED HD 32'' > >>>>> LE32W454F", > >>>>> "TV LED 58\" ULTRA HD SMART 58UF8300", "TV LED 55\" FULL HD SMART 3D > >>>>> KDL-55W805C", > >>>>> "TV LED 49\" ULTRA HD 4K XBR-49X835C", "TV LED 55\" ULTRA HD 4K > >>>>> XBR-55X855C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-65X905C", "TV LED 75\" > >>> ULTRA HD > >>>>> 4K 3D XBR-75X945C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-55X905C", "SMART TV LED 60'' > >>>>> ULTRA HD 4K LC60UE30U", > >>>>> "SMART TV LED 70'' ULTRA HD 4K LC70UE30U", "SMART TV LED 80'' ULTRA HD > >>> 4K > >>>>> LC80UE30U", > >>>>> "SMART TV LED FULL HD 48'' 48J5500", "SMART TV CURVO 79UG8800 79\" > >>> ULTRA HD > >>>>> 4K 3D", > >>>>> "SMART TV 65UF9500 65\" ULTRA HD 4K 3D", "SMART TV 65UF8500 65\" ULTRA > >>> HD > >>>>> 4K 3D", > >>>>> "SMART TV 55UF9500 55\" ULTRA HD 4K 3D", "SMART TV LED HD 32\" > >>> 32J4300", > >>>>> "TV LED 48\" SMART FULL HD KDL-48R555C - NEGRO", "SMART TV 55UG8700 > >>> 55\" > >>>>> ULTRA HD 4K 3D", > >>>>> "SMART TV 60UF8500 60\" ULTRA HD 4K 3D", "SMART TV 55LF6500 55\" FULL > >>> HD > >>>>> 3D", > >>>>> "TV 32LF550B 32\" HD", "TV LED 47\" FULL HD 47LB5610", "TV LED FULL HD > >>> 50'' > >>>>> TC-50AS600L", > >>>>> "TV SMART LED 55\" UHD 3D XBR-55X855B", "TV LED FULL HD 4K LC70SQ17U > >>> 70''", > >>>>> "TV LED SMART UHD 79\" XBR-79X905B", "TV LED FULL HD 40'' TC-40A400L", > >>>>> "TV LED SMART UHD 70\" XBR-70X855B", "SMART TV UHD 55'' 3D CURVO > >>> 55HU8700", > >>>>> "TV FULL HD LE40D3142 40\" - NEGRO", "TELEVISOR LED 42\" TC-42AS650L", > >>>>> "SMART TV LCD FHD 70\" LC70LE660", "TV LED FULL HD 58'' LE58D3140" > >>>>> ), pulgadas = c(48L, 40L, 40L, 28L, 40L, 32L, 32L, 55L, 40L, > >>>>> 24L, 42L, 50L, 40L, 48L, 50L, 55L, 55L, 55L, 55L, 65L, 65L, 65L, > >>>>> 75L, 40L, 42L, 42L, 49L, 49L, 43L, 49L, 49L, 49L, 49L, 55L, 65L, > >>>>> 55L, 55L, 50L, 70L, 65L, 60L, 50L, 50L, 40L, 40L, 50L, 32L, 50L, > >>>>> 48L, 40L, 32L, 40L, 55L, 40L, 43L, 32L, 49L, 65L, 48L, 40L, 32L, > >>>>> 32L, 55L, 32L, 58L, 55L, 49L, 55L, 55L, 75L, 55L, 60L, 70L, 80L, > >>>>> 48L, 79L, 65L, 65L, 55L, 32L, 48L, 55L, 60L, 55L, 32L, 47L, 50L, > >>>>> 55L, 70L, 79L, 40L, 70L, 55L, 40L, 42L, 70L, 58L), precio.antes > >>> c(2799L, > >>>>> 1799L, 1699L, 599L, 1299L, 699L, 999L, 1999L, 999L, 499L, 1899L, > >>>>> 1799L, 2499L, 3999L, 3699L, 10999L, 4299L, 5499L, 6999L, 14999L, > >>>>> 8999L, 9999L, 14599L, 1999L, 2299L, 2299L, 2899L, 2999L, 2299L, > >>>>> 23992L, 3599L, 3799L, 4799L, 4999L, 8499L, 5999L, 4999L, 3999L, > >>>>> 11999L, 10999L, 4399L, 4499L, 3799L, 1399L, 2299L, 2799L, 999L, > >>>>> 2199L, 2299L, 2299L, 1299L, 1699L, 3499L, 1399L, 1549L, 1299L, > >>>>> 2399L, 6499L, 2999L, 999L, 1249L, 999L, 14999L, 799L, 5999L, > >>>>> 4499L, 4999L, 6499L, 12999L, 24999L, 8999L, 5999L, 7599L, 14999L, > >>>>> 2499L, 29999L, 13999L, 9999L, 9699L, 1299L, 2399L, 6999L, 7999L, > >>>>> 3699L, 999L, 1899L, 2999L, 7999L, 8499L, 24999L, 1399L, 13999L, > >>>>> 8499L, 999L, 2599L, 5799L, 2399L), precio.nuevo = c(2299, 1399, > >>>>> 1299, 549, 1099, 629, 799, 1699, 849, 439, 1499, 1549, 1759.2, > >>>>> 2099.3, 2309.3, 7699.3, 2799.3, 3639.3, 4899.3, 10499.3, 5109.3, > >>>>> 6999.3, 10219.3, 1399, 1599, 1599, 2199, 2199, 1299, 23992, 2299, > >>>>> 2299, 2899, 2999, 5999, 3899, 4999, 3999, 8999, 6999, 4099, 3999, > >>>>> 3499, 1299, 1799, 2399, 799, 2199, 1799, 1999, 1199, 1599, 2999, > >>>>> 1199, 1399, 1099, 1999, 5999, 2799, 999, 1199, 949, 7999, 799, > >>>>> 5299, 4299, 3999, 5999, 11999, 23999, 7999, 5699, 7599, 14499, > >>>>> 2399, 29999, 11999, 8999, 7499, 1099, 2199, 6599, 7099, 3599, > >>>>> 899, 1599, 2199, 4999, 6499, 19999, 1399, 9999, 5999, 999, 2599, > >>>>> 5699, 2399), dif.precios = c(500, 400, 400, 50, 200, 70, 200, > >>>>> 300, 150, 60, 400, 250, 739.8, 1899.7, 1389.7, 3299.7, 1499.7, > >>>>> 1859.7, 2099.7, 4499.7, 3889.7, 2999.7, 4379.7, 600, 700, 700, > >>>>> 700, 800, 1000, 0, 1300, 1500, 1900, 2000, 2500, 2100, 0, 0, > >>>>> 3000, 4000, 300, 500, 300, 100, 500, 400, 200, 0, 500, 300, 100, > >>>>> 100, 500, 200, 150, 200, 400, 500, 200, 0, 50, 50, 7000, 0, 700, > >>>>> 200, 1000, 500, 1000, 1000, 1000, 300, 0, 500, 100, 0, 2000, > >>>>> 1000, 2200, 200, 200, 400, 900, 100, 100, 300, 800, 3000, 2000, > >>>>> 5000, 0, 4000, 2500, 0, 0, 100, 0), dif.porcentual = c(17.86, > >>>>> 22.23, 23.54, 8.35, 15.4, 10.01, 20.02, 15.01, 15.02, 12.02, > >>>>> 21.06, 13.9, 29.6, 47.5, 37.57, 30, 34.88, 33.82, 30, 30, 43.22, > >>>>> 30, 30, 30.02, 30.45, 30.45, 24.15, 26.68, 43.5, 0, 36.12, 39.48, > >>>>> 39.59, 40.01, 29.42, 35.01, 0, 0, 25, 36.37, 6.82, 11.11, 7.9, > >>>>> 7.15, 21.75, 14.29, 20.02, 0, 21.75, 13.05, 7.7, 5.89, 14.29, > >>>>> 14.3, 9.68, 15.4, 16.67, 7.69, 6.67, 0, 4, 5.01, 46.67, 0, 11.67, > >>>>> 4.45, 20, 7.69, 7.69, 4, 11.11, 5, 0, 3.33, 4, 0, 14.29, 10, > >>>>> 22.68, 15.4, 8.34, 5.72, 11.25, 2.7, 10.01, 15.8, 26.68, 37.5, > >>>>> 23.53, 20, 0, 28.57, 29.42, 0, 0, 1.72, 0), rangos = c("S/.1500 - > >>> S/.2500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - > >>>>> S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.500 - S/.1500", "< S/.500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.3500 - S/.4500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "S/.500 - > >>> S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "> S/.4,500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.2500 - S/.3500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "> S/.4,500", "S/.3500 - S/.4500", > >>>>> "> S/.4,500", "> S/.4,500", "S/.3500 - S/.4500", "S/.3500 - S/.4500", > >>>>> "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.500 - S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.500 - S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.500 - > >>> S/.1500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "S/.500 - S/.1500", "> S/.4,500", > >>>>> "S/.3500 - S/.4500", "S/.3500 - S/.4500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.500 - S/.1500", "S/.1500 - S/.2500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", > >>>>> "S/.2500 - S/.3500", "> S/.4,500", "S/.1500 - S/.2500")), .Names > >>> c("id", > >>>>> "marca", "producto", "pulgadas", "precio.antes", "precio.nuevo", > >>>>> "dif.precios", "dif.porcentual", "rangos"), class = "data.frame", > >>> row.names > >>>>> = c(NA, > >>>>> -97L)) > > > > ______________________________________________ > > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > > https://stat.ethz.ch/mailman/listinfo/r-help > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > > and provide commented, minimal, self-contained, reproducible code. > > David Winsemius > Alameda, CA, USA > >David Winsemius Alameda, CA, USA
David answered most of this. Just a two short notes inline. On Oct 10, 2015, at 12:38 AM, Omar Andr? Gonz?les D?az <oma.gonzales at gmail.com> wrote:> David, Boris, so thankfull for your help. Both approaches are very good. I got this solve with David's help. > > I find very insteresting Bori's for loop. And I need a little help understanding the regex part on it. > > - The strsplit function: strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+") > > I understand for this: split every row by a sequence of any number or letter or "-" that appears at leat once (+ operator). > > 1.- What does mena the "^" symbol? If you remove it, just appeare blanks. > 2.- Why is there the necessity of "+" after the closing "]"? > > 3.- How this: ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] > Identifies also the IDs where "-" is present. Here the regex does not have the "-" included.Yes. I am not matching the entire token here. Note there is no "+": The two character-class expressions match exactly one uppercase character adjacent to exactly one number. If this is found in a token, grep returns TRUE. It doesn't matter what else the token contains - the first regex already took care of removing everything that's not needed. The vector of FALSEs and a single TRUE that grep() returns goes inside the square brackets, and selects the token from v.> Also, I notice that David used the "-" at the begining of the matching: [-A-Z0-9], without the "^" (stars with) at the beginning.This can be very confusing about regular expressions: the same character can mean different things depending on where it is found. Between two characters in a character class expresssion, the hyphen means "range". Elsewhere it is a literal hyphen. David put his at the beginning, I had it at the end (in the first regex). Another tricky character is "?" which can mean 0,1 matches, or turn "greedy" matching off... Online regex testers are invaluable to develop a regex - one I frequently use is regexpal.com Cheers, B.> > I would appreciate a response from you, gentlemen. > > Thanks again. > > > > > > > > > > > > 2015-10-09 18:32 GMT-05:00 David Winsemius <dwinsemius at comcast.net>: > > On Oct 9, 2015, at 4:21 PM, Boris Steipe wrote: > > > I think you are going into the wrong direction here and this is a classical example of what we mean by "technical debt" of code. Rather than tell to your regular expression what you are looking for, you are handling special cases with redundant code. This is ugly, brittle and impossible to maintain. > > > > Respect to you that you have recognized this. > > > > > > The solution is rather simple: > > > > A) Isolate tokens. Your IDs contain only a limited set of characters. Split your strings along the characters that are not found in IDs to isolate candidate tokens, place them into a vector. > > > > B) Evaluate your tokens: as far as I can see IDs all contain letters AND numbers. This is a unique characteristic. Thus it is sufficient to grep for a letter/number pair in a token to identify it as an ID. > > > > Should you ever find a need to accommodate differently formed IDs, there are only two, well defined places with clearly delegated roles where changes might be needed. > > > > Here is the code: > > > > for (i in 1:nrow(ripley.tv)) { > > v <- unlist(strsplit(ripley.tv$producto[i], "[^A-Z0-9-]+")) # isolate tokens > > ripley.tv$id[i] <- v[grep("[A-Z][0-9]", v)] # identify IDs and store > > } > > That logic actually simplifies the regex strategy as well: > > sub("(.*[ \n])([-A-Z0-9]{6,12})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > > Almost succeeds, with a few all-character words, but if you require one number in the middle you get full results: > > sub("(.*[ \n])([-A-Z0-9]{3,6}[0-9][-A-Z0-9]{2,6})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > [1] "48J6400" "40J5300" "TC-40CS600L" "LE28F6600" "LE40K5000N" > [6] "LE32B7000" "LE32K5000N" "LE55B8000" "LE40B8000" "LE24B8000" > [11] "TC-42AS610" "LE50K5000N" "40JU6500" "48JU6500" "50JU6500" > [16] "55JS9000" "55JU6500" "55JU6700" "55JU7500" "65JS9000" > [21] "65JU6500" "65JU7500" "75JU6500" "40LF6350" "42LF6400" > [26] "42LF6450" "49LF6450" "LF6400" "43UF6750" "49UF6750" > [31] "UF6900" "49UF7700" "49UF8500" "55UF7700" "65UF7700" > [36] "55UF8500" "TC-55CX640W" "TC-50CX640W" "70UF7700" "UG8700" > [41] "LF6350" "KDL-50FA95C" "KDL50W805C" "KDL-40R354B" "40J5500" > [46] "50J5500" "32JH4005" "50J5300" "48J5300" "40J6400" > [51] "KDL-32R505C" "KDL-40R555C" "55J6400" "40JH5005" "43LF5410" > [56] "32LF585B" "49LF5900" "KDL-65W855C" "UN48J6500" "LE40F1551" > [61] "TC-32AS600L" "KDL-32R304B" "55EC9300" "LE32W454F" "58UF8300" > [66] "KDL-55W805C" "XBR-49X835C" "XBR-55X855C" "XBR-65X905C" "XBR-75X945C" > [71] "XBR-55X905C" "LC60UE30U" "LC70UE30U" "LC80UE30U" "48J5500" > [76] "79UG8800" "65UF9500" "65UF8500" "55UF9500" "32J4300" > [81] "KDL-48R555C" "55UG8700" "60UF8500" "55LF6500" "32LF550B" > [86] "47LB5610" "TC-50AS600L" "XBR-55X855B" "LC70SQ17U" "XBR-79X905B" > [91] "TC-40A400L" "XBR-70X855B" "55HU8700" "LE40D3142" "TC-42AS650L" > [96] "LC70LE660" "LE58D3140" > > > > > > > > > Cheers, > > Boris > > > > > > > > On Oct 9, 2015, at 5:48 PM, Omar Andr? Gonz?les D?az <oma.gonzales at gmail.com> wrote: > > > >>>>> ripley.tv <- structure(list(id = c(NA, NA, NA, NA, NA, NA, NA, NA, > >>> NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > >>>>> NA, NA, NA, NA, NA, NA, NA), marca = c("SAMSUNG", "SAMSUNG", > >>>>> "PANASONIC", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", > >>>>> "HAIER", "PANASONIC", "HAIER", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "LG", "LG", "LG", "LG", "LG", "LG", "LG", > >>>>> "LG", "LG", "LG", "LG", "LG", "LG", "PANASONIC", "PANASONIC", > >>>>> "LG", "LG", "LG", "SONY", "SONY", "SONY", "SAMSUNG", "SAMSUNG", > >>>>> "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SONY", "SONY", "SAMSUNG", > >>>>> "SAMSUNG", "LG", "LG", "LG", "SONY", "SAMSUNG", "AOC", "PANASONIC", > >>>>> "SONY", "LG", "AOC", "LG", "SONY", "SONY", "SONY", "SONY", "SONY", > >>>>> "SONY", "SHARP", "SHARP", "SHARP", "SAMSUNG", "LG", "LG", "LG", > >>>>> "LG", "SAMSUNG", "SONY", "LG", "LG", "LG", "LG", "LG", "PANASONIC", > >>>>> "SONY", "SHARP", "SONY", "PANASONIC", "SONY", "SAMSUNG", "AOC", > >>>>> "PANASONIC", "SHARP", "AOC"), producto = c("SMART TV LED FHD 48\" 3D > >>>>> 48J6400", > >>>>> "SMART TV LED FHD 40\" 40J5300", "TV LED FULL HD 40'' TC-40CS600L", > >>>>> "TELEVISOR LED LE28F6600 28\"", "SMART TV 40\" HD LE40K5000N", > >>>>> "TV LED HD 32'' LE32B7000", "SMART TV 32'' LE32K5000N", "TV LED FHD > >>> 55\" - > >>>>> LE55B8000", > >>>>> "TV LED LE40B8000 FULL HD 40\"", "TV LE24B8000 LED HD 24\" - NEGRO", > >>>>> "TV LED FULL HD 42'' TC-42AS610", "TELEVISOR LED LE50K5000N 50\"", > >>>>> "SMART TV LED UHD 40\" 40JU6500", "SMART TV ULTRA HD 48'' 48JU6500", > >>>>> "SMART TV 50JU6500 LED UHD 50\" - NEGRO", "SMART TV ULTRA HD 55'' 3D > >>>>> 55JS9000", > >>>>> "SMART TV LED UHD 55\" 55JU6500", "SMART TV ULTRA HD 55'' 55JU6700", > >>>>> "SMART TV CURVO 55JU7500 LED UHD 55\" 3D - NEGRO", "SMART TV ULTRA HD > >>> 65'' > >>>>> 3D 65JS9000", > >>>>> "SMART TV 65JU6500 LED UHD 65\"", "SMART TV ULTRA HD 65'' 65JU7500", > >>>>> "SMART TV LED UHD 75\" 75JU6500", "SMART TV WEB OS 40\" FULL HD > >>> 40LF6350", > >>>>> "SMART TV 3D 42\" FULL HD 42LF6400", "TV LED 42\" FULL HD CINEMA 3D > >>>>> 42LF6450", > >>>>> "TV LED 49\" FULL HD CINEMA 3D 49LF6450", "SMART TV LF6400 49\" FULL HD > >>>>> 3D", > >>>>> "TV 43UF6750 43\" ULTRA HD 4K", "TV 49\" ULTRA HD 4K 49UF6750", > >>>>> "TV LED 49\" ULTRA HD SMART UF6900", "SMART TV 49UF7700 49\" ULTRA HD > >>> 4K", > >>>>> "SMART TV 49UF8500 49\" ULTRA HD 4K 3D", "TV LED 55\" CINEMA 3D SMART > >>> TV > >>>>> 55UF7700", > >>>>> "SMART TV 65UF7700 65\" ULTRA HD 4K", "SMART TV 55UF8500 55\" ULTRA HD > >>> 4K > >>>>> 3D", > >>>>> "TV LED 55\" ULTRA HD 4K SMART TC-55CX640W", "TV LED 50\" ULTRA HD 4K > >>> SMART > >>>>> TC-50CX640W", > >>>>> "SMART TV 70UF7700 3D ULTRA HD 70\"", "TV LED CURVO 65\" ULTRA HD 4K > >>> CINEMA > >>>>> SMART UG8700", > >>>>> "TV LED 60\" FULL HD SMART LF6350", "SMART TV KDL-50FA95C 50\" FULL HD > >>> 3D", > >>>>> "SMART TV KDL50W805C 50\" FULL HD 3D", "TV LED 40\" FULL HD > >>> KDL-40R354B", > >>>>> "SMART TV LED FULL HD 40'' 40J5500", "SMART TV LED FULL HD 50'' > >>> 50J5500", > >>>>> "TV LED HD 32'' 32JH4005", "SMART TV LED FULL HD 50\" 50J5300", > >>>>> "SMART TV LED 48\" FULL HD 48J5300", "SMART TV FULL HD 40'' 3D > >>> 40J6400", > >>>>> "TV LED 32\" HD SMART KDL-32R505C", "TV LED 40\" SMART FULL HD > >>> KDL-40R555C > >>>>> - NEGRO", > >>>>> "SMART TV LED FHD 55\" 3D 55J6400", "TV 40JH5005 LED FHD 40\" - NEGRO", > >>>>> "TV 43\" FULL HD 43LF5410", "SMART TV 32LF585B LED HD 32\" - BLANCO", > >>>>> "TV LED 49\" FULL HD SMART 49LF5900", "SMART TV 65\" FULL HD 3D > >>>>> KDL-65W855C", > >>>>> "SMART TV LED FHD 48\" UN48J6500", "TV LED 40\" FULL HD LE40F1551", > >>>>> "TV LED 32'' SMART HD TC-32AS600L", "TV LED 32'' HD KDL-32R304B", > >>>>> "TV OLED 55\" SMART 3D FULL HD 55EC9300 PLATEADO", "TV LED HD 32'' > >>>>> LE32W454F", > >>>>> "TV LED 58\" ULTRA HD SMART 58UF8300", "TV LED 55\" FULL HD SMART 3D > >>>>> KDL-55W805C", > >>>>> "TV LED 49\" ULTRA HD 4K XBR-49X835C", "TV LED 55\" ULTRA HD 4K > >>>>> XBR-55X855C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-65X905C", "TV LED 75\" > >>> ULTRA HD > >>>>> 4K 3D XBR-75X945C", > >>>>> "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-55X905C", "SMART TV LED 60'' > >>>>> ULTRA HD 4K LC60UE30U", > >>>>> "SMART TV LED 70'' ULTRA HD 4K LC70UE30U", "SMART TV LED 80'' ULTRA HD > >>> 4K > >>>>> LC80UE30U", > >>>>> "SMART TV LED FULL HD 48'' 48J5500", "SMART TV CURVO 79UG8800 79\" > >>> ULTRA HD > >>>>> 4K 3D", > >>>>> "SMART TV 65UF9500 65\" ULTRA HD 4K 3D", "SMART TV 65UF8500 65\" ULTRA > >>> HD > >>>>> 4K 3D", > >>>>> "SMART TV 55UF9500 55\" ULTRA HD 4K 3D", "SMART TV LED HD 32\" > >>> 32J4300", > >>>>> "TV LED 48\" SMART FULL HD KDL-48R555C - NEGRO", "SMART TV 55UG8700 > >>> 55\" > >>>>> ULTRA HD 4K 3D", > >>>>> "SMART TV 60UF8500 60\" ULTRA HD 4K 3D", "SMART TV 55LF6500 55\" FULL > >>> HD > >>>>> 3D", > >>>>> "TV 32LF550B 32\" HD", "TV LED 47\" FULL HD 47LB5610", "TV LED FULL HD > >>> 50'' > >>>>> TC-50AS600L", > >>>>> "TV SMART LED 55\" UHD 3D XBR-55X855B", "TV LED FULL HD 4K LC70SQ17U > >>> 70''", > >>>>> "TV LED SMART UHD 79\" XBR-79X905B", "TV LED FULL HD 40'' TC-40A400L", > >>>>> "TV LED SMART UHD 70\" XBR-70X855B", "SMART TV UHD 55'' 3D CURVO > >>> 55HU8700", > >>>>> "TV FULL HD LE40D3142 40\" - NEGRO", "TELEVISOR LED 42\" TC-42AS650L", > >>>>> "SMART TV LCD FHD 70\" LC70LE660", "TV LED FULL HD 58'' LE58D3140" > >>>>> ), pulgadas = c(48L, 40L, 40L, 28L, 40L, 32L, 32L, 55L, 40L, > >>>>> 24L, 42L, 50L, 40L, 48L, 50L, 55L, 55L, 55L, 55L, 65L, 65L, 65L, > >>>>> 75L, 40L, 42L, 42L, 49L, 49L, 43L, 49L, 49L, 49L, 49L, 55L, 65L, > >>>>> 55L, 55L, 50L, 70L, 65L, 60L, 50L, 50L, 40L, 40L, 50L, 32L, 50L, > >>>>> 48L, 40L, 32L, 40L, 55L, 40L, 43L, 32L, 49L, 65L, 48L, 40L, 32L, > >>>>> 32L, 55L, 32L, 58L, 55L, 49L, 55L, 55L, 75L, 55L, 60L, 70L, 80L, > >>>>> 48L, 79L, 65L, 65L, 55L, 32L, 48L, 55L, 60L, 55L, 32L, 47L, 50L, > >>>>> 55L, 70L, 79L, 40L, 70L, 55L, 40L, 42L, 70L, 58L), precio.antes > >>> c(2799L, > >>>>> 1799L, 1699L, 599L, 1299L, 699L, 999L, 1999L, 999L, 499L, 1899L, > >>>>> 1799L, 2499L, 3999L, 3699L, 10999L, 4299L, 5499L, 6999L, 14999L, > >>>>> 8999L, 9999L, 14599L, 1999L, 2299L, 2299L, 2899L, 2999L, 2299L, > >>>>> 23992L, 3599L, 3799L, 4799L, 4999L, 8499L, 5999L, 4999L, 3999L, > >>>>> 11999L, 10999L, 4399L, 4499L, 3799L, 1399L, 2299L, 2799L, 999L, > >>>>> 2199L, 2299L, 2299L, 1299L, 1699L, 3499L, 1399L, 1549L, 1299L, > >>>>> 2399L, 6499L, 2999L, 999L, 1249L, 999L, 14999L, 799L, 5999L, > >>>>> 4499L, 4999L, 6499L, 12999L, 24999L, 8999L, 5999L, 7599L, 14999L, > >>>>> 2499L, 29999L, 13999L, 9999L, 9699L, 1299L, 2399L, 6999L, 7999L, > >>>>> 3699L, 999L, 1899L, 2999L, 7999L, 8499L, 24999L, 1399L, 13999L, > >>>>> 8499L, 999L, 2599L, 5799L, 2399L), precio.nuevo = c(2299, 1399, > >>>>> 1299, 549, 1099, 629, 799, 1699, 849, 439, 1499, 1549, 1759.2, > >>>>> 2099.3, 2309.3, 7699.3, 2799.3, 3639.3, 4899.3, 10499.3, 5109.3, > >>>>> 6999.3, 10219.3, 1399, 1599, 1599, 2199, 2199, 1299, 23992, 2299, > >>>>> 2299, 2899, 2999, 5999, 3899, 4999, 3999, 8999, 6999, 4099, 3999, > >>>>> 3499, 1299, 1799, 2399, 799, 2199, 1799, 1999, 1199, 1599, 2999, > >>>>> 1199, 1399, 1099, 1999, 5999, 2799, 999, 1199, 949, 7999, 799, > >>>>> 5299, 4299, 3999, 5999, 11999, 23999, 7999, 5699, 7599, 14499, > >>>>> 2399, 29999, 11999, 8999, 7499, 1099, 2199, 6599, 7099, 3599, > >>>>> 899, 1599, 2199, 4999, 6499, 19999, 1399, 9999, 5999, 999, 2599, > >>>>> 5699, 2399), dif.precios = c(500, 400, 400, 50, 200, 70, 200, > >>>>> 300, 150, 60, 400, 250, 739.8, 1899.7, 1389.7, 3299.7, 1499.7, > >>>>> 1859.7, 2099.7, 4499.7, 3889.7, 2999.7, 4379.7, 600, 700, 700, > >>>>> 700, 800, 1000, 0, 1300, 1500, 1900, 2000, 2500, 2100, 0, 0, > >>>>> 3000, 4000, 300, 500, 300, 100, 500, 400, 200, 0, 500, 300, 100, > >>>>> 100, 500, 200, 150, 200, 400, 500, 200, 0, 50, 50, 7000, 0, 700, > >>>>> 200, 1000, 500, 1000, 1000, 1000, 300, 0, 500, 100, 0, 2000, > >>>>> 1000, 2200, 200, 200, 400, 900, 100, 100, 300, 800, 3000, 2000, > >>>>> 5000, 0, 4000, 2500, 0, 0, 100, 0), dif.porcentual = c(17.86, > >>>>> 22.23, 23.54, 8.35, 15.4, 10.01, 20.02, 15.01, 15.02, 12.02, > >>>>> 21.06, 13.9, 29.6, 47.5, 37.57, 30, 34.88, 33.82, 30, 30, 43.22, > >>>>> 30, 30, 30.02, 30.45, 30.45, 24.15, 26.68, 43.5, 0, 36.12, 39.48, > >>>>> 39.59, 40.01, 29.42, 35.01, 0, 0, 25, 36.37, 6.82, 11.11, 7.9, > >>>>> 7.15, 21.75, 14.29, 20.02, 0, 21.75, 13.05, 7.7, 5.89, 14.29, > >>>>> 14.3, 9.68, 15.4, 16.67, 7.69, 6.67, 0, 4, 5.01, 46.67, 0, 11.67, > >>>>> 4.45, 20, 7.69, 7.69, 4, 11.11, 5, 0, 3.33, 4, 0, 14.29, 10, > >>>>> 22.68, 15.4, 8.34, 5.72, 11.25, 2.7, 10.01, 15.8, 26.68, 37.5, > >>>>> 23.53, 20, 0, 28.57, 29.42, 0, 0, 1.72, 0), rangos = c("S/.1500 - > >>> S/.2500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - > >>>>> S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.500 - S/.1500", "< S/.500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.3500 - S/.4500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "S/.500 - > >>> S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "> S/.4,500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.2500 - S/.3500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "> S/.4,500", "S/.3500 - S/.4500", > >>>>> "> S/.4,500", "> S/.4,500", "S/.3500 - S/.4500", "S/.3500 - S/.4500", > >>>>> "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.500 - S/.1500", > >>>>> "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.500 - S/.1500", > >>>>> "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > >>>>> "> S/.4,500", "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.500 - > >>> S/.1500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "S/.500 - S/.1500", "> S/.4,500", > >>>>> "S/.3500 - S/.4500", "S/.3500 - S/.4500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.500 - S/.1500", "S/.1500 - S/.2500", "> S/.4,500", > >>>>> "> S/.4,500", "S/.3500 - S/.4500", "S/.500 - S/.1500", "S/.1500 - > >>> S/.2500", > >>>>> "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > >>>>> "S/.500 - S/.1500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", > >>>>> "S/.2500 - S/.3500", "> S/.4,500", "S/.1500 - S/.2500")), .Names > >>> c("id", > >>>>> "marca", "producto", "pulgadas", "precio.antes", "precio.nuevo", > >>>>> "dif.precios", "dif.porcentual", "rangos"), class = "data.frame", > >>> row.names > >>>>> = c(NA, > >>>>> -97L)) > > > > ______________________________________________ > > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > > https://stat.ethz.ch/mailman/listinfo/r-help > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > > and provide commented, minimal, self-contained, reproducible code. > > David Winsemius > Alameda, CA, USA > >