arnaud gaboury
2015-Feb-12 14:40 UTC
[R] gsub : replace regex pattern with values from another data.frame
I have two df (and dt): df1 structure(list(name = c("poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", "cruzecontrol", "agreenmamba", "agreenmamba", "vairis", "vairis", "vairis", "vairis", "vairis", "vairis", "xaeth"), text = c("ok", "need items ?", "i didn't submit pass codes for a long now", "ok", "<@U03AEKYL4>: what app are you talking about ?", "some testing with my irc client", "ha ha sorry", "for me there is no such question", "Lol.", "<@U03AEKWTL|agreenmamba> uploaded a file: <https://enlightened.slack.com/files/agreenmamba/F03KGRF3W/screenshot_2015-02-09-14-31-15.png|regarding: should I stay or should I go?>", "<@U032FHV3S> <http://youtu.be/oGIFublvDes>", "ok, see you around", "yeah, I had a procrastination rush so I started to decode a little", "<http://ingress.com/intel|ingress.com/intel> when you submit passcodes", "intel", "what is the cooldown time or how does it work...;", "anybody knows how does \"Passcode circuitry too hot. 
Wait for cool down to enter another passcode.\" works?", "and people told that agent their geocities experience would never amount to anything (the convo yesterday) " ), ts = c("1423594336.000138", "1423594311.000136", "1423594294.000135", "1423594258.000133", "1423594244.000131", "1423497058.000127", "1423497041.000126", "1423478555.000123", "1423494427.000125", "1423492370.000124", "1423478364.000121", "1423594358.000139", "1423594329.000137", "1423594264.000134", "1423594251.000132", "1423592204.000130", "1423592174.000129", "1423150354.000112" )), .Names = c("name", "text", "ts"), class = c("data.table", "data.frame"), row.names = c(NA, -18L)) df2 structure(list(id = c("U03KH8Z52", "U02AF1DTJ", "U02AF0ZT8", "U03AEKWTL", "U02BCJH0G", "U033YA1MS", "U029QMCRR", "U03H139M5", "U02AET1D0", "U02A6U41Z", "U02B5T4CX", "U02B2QU4R", "U03F0LQ5X", "U03JNFKLY", "U02ASMBMQ", "U029QLQC7", "U03AEMBQU", "U02B4D3Q1", "U02AGDC14", "U029A467C", "U02A7NFG6", "U02AESPPL", "U02AQANK7", "U03ADJDFK", "U03EYR0KB", "U02AW7Q5Q", "U02AE8RKD", "U02FT84BS", "U02B25M3B", "U03EZDQT7", "U02AECKFF", "U03H2691M", "U02DWTJ5V", "U02AFTAHH", "U029QQEPM", "U03C51Z42", "U02CAK2CV", "U03AK21DP", "U03FFN8ED", "U02B23V03", "U029T2143", "U02C1LEEX", "U03AF2QH2", "U03E0GN0S", "U03AG20R9", "U02AES8S2", "U02AG64S7", "U02B5A0R7", "U02AS4SLR", "U03C2SG0R", "U03AV7CCW", "U032XPFDU", "U03AUKSSV", "U02C2A61Y", "U02AESHJQ", "U02BLSKHU", "U02E34WM6", "U03AK6P26", "U02E6ADRZ", "U03FCDQ50", "U03EW1CC5", "U02BL0DBD", "U02FHQZ6D", "U02B47T63", "U03H2TTQP", "U03AVP71V", "U03JLV38V", "U02E39HAY", "U02AE5281", "U032FHV3S", "U03AL2096", "U02ARUG6M", "U02AECRSP", "U02B42XG4", "U03AFQZNS", "U02AE7H41", "U03G9UNTG", "U02GEQ0E6", "U02AGLE5A", "U02BQTRC9", "U03H0J6GS", "U02B3D27F", "U02AEKTHV", "U02C52YN3", "U02E33MUW", "U03AKUT85", "U03B53EHG", "U02FBN38P", "U03AH3E5W", "U02B5PLE0", "U02AS4RCK", "U03ANE1GZ", "U02E8LZQB", "U03EPGJ98", "U02E3N220", "U03AEKYL4", "U02AE7HT1", "U02C1RR3G", "U03JH408J", "U03KL0FKN", "U02B44R92", 
"U03EURWGX"), name = c("10k_affair", "1upwuzhere", "4xcss", "agreenmamba", "ait109", "arly69", "azkop13", "barcik75", "bigolnob", "blackrose", "blink619", "bobaloo23", "bodger", "bomb", "bootswithdefer", "brandizzle", "bregalad", "camon", "celticrain", "ch3mical", "checksum", "cocothunder", "cruxicon", "cruzecontrol", "crystalskunk", "cscheetah", "dabcelin", "deelicious", "delthanar", "drkaosdk", "droidenl-joe", "dukeceph", "fillerbunny", "flickohmsford", "flyingg0d", "garaxiel", "goby9", "gymbal", "hideandseek", "hobojr", "ijackportals", "invalidcharactr", "itso9", "j0shs", "jarvis", "jc0mm5", "jencyberchic", "jimbobradyson", "joespr0cket", "jostrander", "jueliet", "karlashi", "khan99", "kingkonn0r", "krispycridder", "kritickalmass", "lawgiver", "maxcorbett", "memory556", "meta000x", "minkovsky", "mistylady", "mstephans", "mstrinity", "nocarryr", "ollietronic", "philistine11", "pickledpickles", "piercingsbykris", "poisonivy", "raugmor", "remarks999", "rheds77", "rhinz", "rigiritter", "robbie0017", "rohdef", "ryoziya", "s4n1ty", "sacredcow133", "samwill", "sgtlemonpepper", "sivan", "spline9", "starwolf", "stueliueli", "sweetiris", "swift2plunder", "swissphoenix", "synyck", "test", "therug", "tinja551", "trulyjuan", "twinster", "vairis", "vinylz3ro", "watervirus", "xaeth", "yagamiyukari", "zafo", "zexium")), .Names = c("id", "name"), class = c("data.table", "data.frame"), row.names = c(NA, -102L)) I need to replace this regex pattern in df1 : (?<=<@)[^|]{9}(?=>|) by its corresponding name from df2. E.g : if <@U03KH8Z52> is found in df1, then I want to replace it by the "name" which correspond to this id in df2., in this case 10k_affair I know of replace an expression with gsub: gsub('(?<=<@)[^|]{9}(?=>|)', 'toto', df1, perl = T) but I have no idea how to replace it with value from another df. Thank you for hints
arnaud gaboury
2015-Feb-12 18:12 UTC
[R] gsub : replace regex pattern with values from another data.frame
On Thu, Feb 12, 2015 at 3:40 PM, arnaud gaboury <arnaud.gaboury at gmail.com> wrote:> I have two df (and dt): > > df1 > structure(list(name = c("poisonivy", "poisonivy", "poisonivy", > "poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", > "cruzecontrol", "agreenmamba", "agreenmamba", "vairis", "vairis", > "vairis", "vairis", "vairis", "vairis", "xaeth"), text = c("ok", > "need items ?", "i didn't submit pass codes for a long now", > "ok", "<@U03AEKYL4>: what app are you talking about ?", "some testing > with my irc client", > "ha ha sorry", "for me there is no such question", "Lol.", > "<@U03AEKWTL|agreenmamba> uploaded a file: > <https://enlightened.slack.com/files/agreenmamba/F03KGRF3W/screenshot_2015-02-09-14-31-15.png|regarding: > should I stay or should I go?>", > "<@U032FHV3S> <http://youtu.be/oGIFublvDes>", "ok, see you around", > "yeah, I had a procrastination rush so I started to decode a little", > "<http://ingress.com/intel|ingress.com/intel> when you submit passcodes", > "intel", "what is the cooldown time or how does it work...;", > "anybody knows how does \"Passcode circuitry too hot. 
Wait for cool > down to enter another passcode.\" works?", > "and people told that agent their geocities experience would never > amount to anything (the convo yesterday) " > ), ts = c("1423594336.000138", "1423594311.000136", "1423594294.000135", > "1423594258.000133", "1423594244.000131", "1423497058.000127", > "1423497041.000126", "1423478555.000123", "1423494427.000125", > "1423492370.000124", "1423478364.000121", "1423594358.000139", > "1423594329.000137", "1423594264.000134", "1423594251.000132", > "1423592204.000130", "1423592174.000129", "1423150354.000112" > )), .Names = c("name", "text", "ts"), class = c("data.table", > "data.frame"), row.names = c(NA, -18L)) > > df2 > structure(list(id = c("U03KH8Z52", "U02AF1DTJ", "U02AF0ZT8", > "U03AEKWTL", "U02BCJH0G", "U033YA1MS", "U029QMCRR", "U03H139M5", > "U02AET1D0", "U02A6U41Z", "U02B5T4CX", "U02B2QU4R", "U03F0LQ5X", > "U03JNFKLY", "U02ASMBMQ", "U029QLQC7", "U03AEMBQU", "U02B4D3Q1", > "U02AGDC14", "U029A467C", "U02A7NFG6", "U02AESPPL", "U02AQANK7", > "U03ADJDFK", "U03EYR0KB", "U02AW7Q5Q", "U02AE8RKD", "U02FT84BS", > "U02B25M3B", "U03EZDQT7", "U02AECKFF", "U03H2691M", "U02DWTJ5V", > "U02AFTAHH", "U029QQEPM", "U03C51Z42", "U02CAK2CV", "U03AK21DP", > "U03FFN8ED", "U02B23V03", "U029T2143", "U02C1LEEX", "U03AF2QH2", > "U03E0GN0S", "U03AG20R9", "U02AES8S2", "U02AG64S7", "U02B5A0R7", > "U02AS4SLR", "U03C2SG0R", "U03AV7CCW", "U032XPFDU", "U03AUKSSV", > "U02C2A61Y", "U02AESHJQ", "U02BLSKHU", "U02E34WM6", "U03AK6P26", > "U02E6ADRZ", "U03FCDQ50", "U03EW1CC5", "U02BL0DBD", "U02FHQZ6D", > "U02B47T63", "U03H2TTQP", "U03AVP71V", "U03JLV38V", "U02E39HAY", > "U02AE5281", "U032FHV3S", "U03AL2096", "U02ARUG6M", "U02AECRSP", > "U02B42XG4", "U03AFQZNS", "U02AE7H41", "U03G9UNTG", "U02GEQ0E6", > "U02AGLE5A", "U02BQTRC9", "U03H0J6GS", "U02B3D27F", "U02AEKTHV", > "U02C52YN3", "U02E33MUW", "U03AKUT85", "U03B53EHG", "U02FBN38P", > "U03AH3E5W", "U02B5PLE0", "U02AS4RCK", "U03ANE1GZ", "U02E8LZQB", > "U03EPGJ98", "U02E3N220", "U03AEKYL4", 
"U02AE7HT1", "U02C1RR3G", > "U03JH408J", "U03KL0FKN", "U02B44R92", "U03EURWGX"), name = c("10k_affair", > "1upwuzhere", "4xcss", "agreenmamba", "ait109", "arly69", "azkop13", > "barcik75", "bigolnob", "blackrose", "blink619", "bobaloo23", > "bodger", "bomb", "bootswithdefer", "brandizzle", "bregalad", > "camon", "celticrain", "ch3mical", "checksum", "cocothunder", > "cruxicon", "cruzecontrol", "crystalskunk", "cscheetah", "dabcelin", > "deelicious", "delthanar", "drkaosdk", "droidenl-joe", "dukeceph", > "fillerbunny", "flickohmsford", "flyingg0d", "garaxiel", "goby9", > "gymbal", "hideandseek", "hobojr", "ijackportals", "invalidcharactr", > "itso9", "j0shs", "jarvis", "jc0mm5", "jencyberchic", "jimbobradyson", > "joespr0cket", "jostrander", "jueliet", "karlashi", "khan99", > "kingkonn0r", "krispycridder", "kritickalmass", "lawgiver", "maxcorbett", > "memory556", "meta000x", "minkovsky", "mistylady", "mstephans", > "mstrinity", "nocarryr", "ollietronic", "philistine11", "pickledpickles", > "piercingsbykris", "poisonivy", "raugmor", "remarks999", "rheds77", > "rhinz", "rigiritter", "robbie0017", "rohdef", "ryoziya", "s4n1ty", > "sacredcow133", "samwill", "sgtlemonpepper", "sivan", "spline9", > "starwolf", "stueliueli", "sweetiris", "swift2plunder", "swissphoenix", > "synyck", "test", "therug", "tinja551", "trulyjuan", "twinster", > "vairis", "vinylz3ro", "watervirus", "xaeth", "yagamiyukari", > "zafo", "zexium")), .Names = c("id", "name"), class = c("data.table", > "data.frame"), row.names = c(NA, -102L)) > > I need to replace this regex pattern in df1 : > (?<=<@)[^|]{9}(?=>|) by its corresponding name from df2. > > E.g : if <@U03KH8Z52> is found in df1, then I want to replace it by > the "name" which correspond to this id in df2., in this case > 10k_affair > > I know of replace an expression with gsub: > gsub('(?<=<@)[^|]{9}(?=>|)', 'toto', df1, perl = T) > but I have no idea how to replace it with value from another df. 
> > Thank you for hints

I am gathering some pieces of the puzzle.

> regmatches(df1$text,regexpr('(?<=<@)[^|]{9}(?=>|)',df1$text, perl = T))
[1] "U032FHV3S" "U03AEKWTL" "U03AEKYL4"

The above command extracts the needed pattern.

df2[grep("U032FHV3S",df2$id),][[2]]
[1] "poisonivy"

The above command returns the name in the same row as the id. I need more than one name (in my case, I need 3).

Shall I now write a loop and get a list of the names I need? Pseudo code would be something like:

for i %in% regmatches(df1$text,regexpr('(?<=<@)[^|]{9}(?=>|)',df1$text, perl = T)) df2[grep("i",df2$id),][[2]]

Thank you for hints about how I shall proceed.

-- google.com/+arnaudgabourygabx
arnaud gaboury
2015-Feb-12 18:31 UTC
[R] gsub : replace regex pattern with values from another data.frame
On Thu, Feb 12, 2015 at 7:12 PM, arnaud gaboury <arnaud.gaboury at gmail.com> wrote:> On Thu, Feb 12, 2015 at 3:40 PM, arnaud gaboury > <arnaud.gaboury at gmail.com> wrote: >> I have two df (and dt): >> >> df1 >> structure(list(name = c("poisonivy", "poisonivy", "poisonivy", >> "poisonivy", "poisonivy", "poisonivy", "poisonivy", "poisonivy", >> "cruzecontrol", "agreenmamba", "agreenmamba", "vairis", "vairis", >> "vairis", "vairis", "vairis", "vairis", "xaeth"), text = c("ok", >> "need items ?", "i didn't submit pass codes for a long now", >> "ok", "<@U03AEKYL4>: what app are you talking about ?", "some testing >> with my irc client", >> "ha ha sorry", "for me there is no such question", "Lol.", >> "<@U03AEKWTL|agreenmamba> uploaded a file: >> <https://enlightened.slack.com/files/agreenmamba/F03KGRF3W/screenshot_2015-02-09-14-31-15.png|regarding: >> should I stay or should I go?>", >> "<@U032FHV3S> <http://youtu.be/oGIFublvDes>", "ok, see you around", >> "yeah, I had a procrastination rush so I started to decode a little", >> "<http://ingress.com/intel|ingress.com/intel> when you submit passcodes", >> "intel", "what is the cooldown time or how does it work...;", >> "anybody knows how does \"Passcode circuitry too hot. 
Wait for cool >> down to enter another passcode.\" works?", >> "and people told that agent their geocities experience would never >> amount to anything (the convo yesterday) " >> ), ts = c("1423594336.000138", "1423594311.000136", "1423594294.000135", >> "1423594258.000133", "1423594244.000131", "1423497058.000127", >> "1423497041.000126", "1423478555.000123", "1423494427.000125", >> "1423492370.000124", "1423478364.000121", "1423594358.000139", >> "1423594329.000137", "1423594264.000134", "1423594251.000132", >> "1423592204.000130", "1423592174.000129", "1423150354.000112" >> )), .Names = c("name", "text", "ts"), class = c("data.table", >> "data.frame"), row.names = c(NA, -18L)) >> >> df2 >> structure(list(id = c("U03KH8Z52", "U02AF1DTJ", "U02AF0ZT8", >> "U03AEKWTL", "U02BCJH0G", "U033YA1MS", "U029QMCRR", "U03H139M5", >> "U02AET1D0", "U02A6U41Z", "U02B5T4CX", "U02B2QU4R", "U03F0LQ5X", >> "U03JNFKLY", "U02ASMBMQ", "U029QLQC7", "U03AEMBQU", "U02B4D3Q1", >> "U02AGDC14", "U029A467C", "U02A7NFG6", "U02AESPPL", "U02AQANK7", >> "U03ADJDFK", "U03EYR0KB", "U02AW7Q5Q", "U02AE8RKD", "U02FT84BS", >> "U02B25M3B", "U03EZDQT7", "U02AECKFF", "U03H2691M", "U02DWTJ5V", >> "U02AFTAHH", "U029QQEPM", "U03C51Z42", "U02CAK2CV", "U03AK21DP", >> "U03FFN8ED", "U02B23V03", "U029T2143", "U02C1LEEX", "U03AF2QH2", >> "U03E0GN0S", "U03AG20R9", "U02AES8S2", "U02AG64S7", "U02B5A0R7", >> "U02AS4SLR", "U03C2SG0R", "U03AV7CCW", "U032XPFDU", "U03AUKSSV", >> "U02C2A61Y", "U02AESHJQ", "U02BLSKHU", "U02E34WM6", "U03AK6P26", >> "U02E6ADRZ", "U03FCDQ50", "U03EW1CC5", "U02BL0DBD", "U02FHQZ6D", >> "U02B47T63", "U03H2TTQP", "U03AVP71V", "U03JLV38V", "U02E39HAY", >> "U02AE5281", "U032FHV3S", "U03AL2096", "U02ARUG6M", "U02AECRSP", >> "U02B42XG4", "U03AFQZNS", "U02AE7H41", "U03G9UNTG", "U02GEQ0E6", >> "U02AGLE5A", "U02BQTRC9", "U03H0J6GS", "U02B3D27F", "U02AEKTHV", >> "U02C52YN3", "U02E33MUW", "U03AKUT85", "U03B53EHG", "U02FBN38P", >> "U03AH3E5W", "U02B5PLE0", "U02AS4RCK", "U03ANE1GZ", "U02E8LZQB", >> 
"U03EPGJ98", "U02E3N220", "U03AEKYL4", "U02AE7HT1", "U02C1RR3G", >> "U03JH408J", "U03KL0FKN", "U02B44R92", "U03EURWGX"), name = c("10k_affair", >> "1upwuzhere", "4xcss", "agreenmamba", "ait109", "arly69", "azkop13", >> "barcik75", "bigolnob", "blackrose", "blink619", "bobaloo23", >> "bodger", "bomb", "bootswithdefer", "brandizzle", "bregalad", >> "camon", "celticrain", "ch3mical", "checksum", "cocothunder", >> "cruxicon", "cruzecontrol", "crystalskunk", "cscheetah", "dabcelin", >> "deelicious", "delthanar", "drkaosdk", "droidenl-joe", "dukeceph", >> "fillerbunny", "flickohmsford", "flyingg0d", "garaxiel", "goby9", >> "gymbal", "hideandseek", "hobojr", "ijackportals", "invalidcharactr", >> "itso9", "j0shs", "jarvis", "jc0mm5", "jencyberchic", "jimbobradyson", >> "joespr0cket", "jostrander", "jueliet", "karlashi", "khan99", >> "kingkonn0r", "krispycridder", "kritickalmass", "lawgiver", "maxcorbett", >> "memory556", "meta000x", "minkovsky", "mistylady", "mstephans", >> "mstrinity", "nocarryr", "ollietronic", "philistine11", "pickledpickles", >> "piercingsbykris", "poisonivy", "raugmor", "remarks999", "rheds77", >> "rhinz", "rigiritter", "robbie0017", "rohdef", "ryoziya", "s4n1ty", >> "sacredcow133", "samwill", "sgtlemonpepper", "sivan", "spline9", >> "starwolf", "stueliueli", "sweetiris", "swift2plunder", "swissphoenix", >> "synyck", "test", "therug", "tinja551", "trulyjuan", "twinster", >> "vairis", "vinylz3ro", "watervirus", "xaeth", "yagamiyukari", >> "zafo", "zexium")), .Names = c("id", "name"), class = c("data.table", >> "data.frame"), row.names = c(NA, -102L)) >> >> I need to replace this regex pattern in df1 : >> (?<=<@)[^|]{9}(?=>|) by its corresponding name from df2. 
>> >> E.g : if <@U03KH8Z52> is found in df1, then I want to replace it by >> the "name" which correspond to this id in df2., in this case >> 10k_affair >> >> I know of replace an expression with gsub: >> gsub('(?<=<@)[^|]{9}(?=>|)', 'toto', df1, perl = T) >> but I have no idea how to replace it with value from another df. >> >> Thank you for hints > > I am gathering some pieces of the puzzles. > >> regmatches(df1$text,regexpr('(?<=<@)[^|]{9}(?=>|)',df1$text, perl = T)) > [1] "U032FHV3S" "U03AEKWTL" "U03AEKYL4" > The above commands extract the needed pattern > > df2[grep("U032FHV3S",df2$id),][[2]] > [1] "poisonivy" > The above command returns the name in the same row than the id. I need > more than one name (in my case, I need 3) > > Shall I now write a loop and get a list of my needed name ? Pseudo > code would be something like: > > for i %in% regmatches(df1$text,regexpr('(?<=<@)[^|]{9}(?=>|)',df1$text, > perl = T)) > df2[grep("i",df2$id),][[2]] > > > Thank you for hint about how I shall proceed. >

Better approach than a loop:

> extrac <- regmatches(df1$text,regexpr('(?<=<@)[^|]{9}(?=>|)',df1$text, perl = T))
> extrac
[1] "U032FHV3S" "U03AEKWTL" "U03AEKYL4"
> df2[df2$id %in% extrac, ]
          id        name
1: U03AEKWTL agreenmamba
2: U032FHV3S   poisonivy
3: U03AEKYL4      vairis