Displaying 20 results from an estimated 20 matches for "removepunctuation".
2012 Jan 27
2
tm package: handling contractions
...a wordcloud of Obama's State of the Union address using
the tm package to process the text
sotu <- scan(file="c:/R/data/sotu2012.txt", what="character")
sotu <- tolower(sotu)
corp <-Corpus(VectorSource(paste(sotu, collapse=" ")))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, stemDocument)
corp <- tm_map(corp, function(x)removeWords(x,stopwords()))
tdm <- TermDocumentMatrix(corp)
m <- as.matrix(tdm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
wordcloud(d$word,d$freq)
I ended up with a large numb...
2009 Nov 12
2
package "tm" fails to remove "the" with remove stopwords
...in", "jack and
jill ran up the hill", "to fetch a pail of water")
text.corp <- Corpus(VectorSource(myDocument))
#########################
text.corp <- tm_map(text.corp, stripWhitespace)
text.corp <- tm_map(text.corp, removeNumbers)
text.corp <- tm_map(text.corp, removePunctuation)
## text.corp <- tm_map(text.corp, stemDocument)
text.corp <- tm_map(text.corp, removeWords, c("the", stopwords("english")))
dtm <- DocumentTermMatrix(text.corp)
dtm
dtm.mat <- as.matrix(dtm)
dtm.mat
> dtm.mat
Terms
Docs falls fetch hill jack jill mainly pail...
2012 Oct 25
2
Minería de texto
...apply(tw.df$text, RemoveAtPeople)) #The following is cribbed and seems to do what it says on the can tw.corpus = Corpus(VectorSource(df)) tw.corpus = tm_map(tw.corpus, function(x) iconv(enc2utf8(x), sub = "byte")) tw.corpus = tm_map(tw.corpus, tolower) tw.corpus = tm_map(tw.corpus, removePunctuation) tw.corpus = tm_map(tw.corpus, function(x) removeWords(x, c(stopwords("spanish"),"rt"))) tw.corpus = tm_map(tw.corpus, removeWords, my.stopwords) tw.corpus = tm_map(tw.corpus, stripWhitespace) sw <- readLines("stopwords.es.txt",encoding="UTF-8") sw =...
2014 Jul 22
2
Ayuda Error in `colnames<-`(`*tmp*`, value = c(
...yte")
> d2<-readLines(txt2, encoding="UTF-8")
> d2<-iconv(enc2utf8(d2), sub = "byte")
> df<-c(d1,d2)
> corpus<-Corpus(VectorSource(df))
> d<-tm_map(corpus, content_transformer(tolower))
> d<-tm_map(d, stripWhitespace)
> d<-tm_map(d, removePunctuation)
> sw<-readLines("./StopWords.txt", encoding="UTF-8")
> sw<-iconv(enc2utf8(sw), sub="byte")
> d<-tm_map(d, removeWords, sw)
> d<-tm_map(d, removeWords, stopwords("spanish"))
> tdm<-TermDocumentMatrix(d)
> m<-as.matrix(tdm)...
2014 Jul 29
2
wordcloud y tabla de palabras [Avanzando]
...equire(tm)
require(wordcloud)
require(Rcpp)
tmpinformes<-data.frame(c("todo el informe 2005", "todo el informe
2013"), row.names=c("2005", "2013"))
ds<- DataframeSource(tmpText)
ds<- DataframeSource(tmpinformes)
corp = Corpus(ds)
corp = tm_map(corp,removePunctuation)
corp = tm_map(corp,content_transformer(tolower))
corp = tm_map(corp,removeNumbers)
corp = tm_map(corp, stripWhitespace)
corp = tm_map(corp, removeWords, sw)
corp = tm_map(corp, removeWords, stopwords("spanish"))
term.matrix<- TermDocumentMatrix(corp)
term.matrix<- as.matrix(term.ma...
2014 Jul 25
3
wordcloud y tabla de palabras
...e) {
info.dir<-sprintf("%s/%s", pathname, informes)
info.cor<-Corpus(DirSource(directory=info.dir, encoding="UTF-8"))
info.cor.cl<-tm_map(info.cor, content_transformer(tolower))
info.cor.cl<-tm_map(info.cor.cl, stripWhitespace)
info.cor.cl<-tm_map(info.cor.cl,removePunctuation)
sw<-readLines("C:/Users/d_2/Documents/StopWords.txt", encoding="UTF-8")
sw<-iconv(enc2utf8(sw), sub = "byte")
info.cor.cl<-tm_map(info.cor.cl, removeWords, stopwords("spanish"))
info.tdm<-TermDocumentMatrix(info.cor.cl)
result<-list(name...
2012 Dec 13
2
Tamaño de la matriz de términos y memoria. Paquete TM
...ye un corpus
corpus <- Corpus(VectorSource(txt))
# lleva a minúsculas
corpus <- tm_map(corpus, tolower)
# quita espacios en blanco
corpus <- tm_map(corpus, stripWhitespace)
# remueve la puntuación
corpus <- tm_map(corpus, removePunctuation)
# carga el archivo de palabras vacías personalizada en español y lo convierte a ASCII
sw <- readLines("D:/Publico/Documents/TextMinigSpanishResources/Stopwords.es.txt",encoding="UTF-8")
sw = iconv(sw, to="ASCII//TRANSLIT")
# remueve...
2014 Jul 28
2
wordcloud y tabla de palabras
...uot;, pathname, informes)
> > info.cor<-Corpus(DirSource(directory=info.dir, encoding="UTF-8"))
> > info.cor.cl<-tm_map(info.cor, content_transformer(tolower))
> > info.cor.cl<-tm_map(info.cor.cl, stripWhitespace)
> > info.cor.cl<-tm_map(info.cor.cl,removePunctuation)
> > sw<-readLines("C:/Users/d_2/Documents/StopWords.txt", encoding="UTF-8")
> > sw<-iconv(enc2utf8(sw), sub = "byte")
> > info.cor.cl<-tm_map(info.cor.cl, removeWords, stopwords("spanish"))
> > info.tdm<-TermDocumentMa...
2014 Jun 17
2
No es un problema de tm tienes doc.corpus vacío
...rtal"inmortal = readLines(TEXTFILE)inmortal =
> readLines(TEXTFILE)length(inmortal)head(inmortal)tail(inmortal)library(tm)vec
> <- VectorSource(inmortal)corpus <-
> Corpus(vec)summary(corpus)inspect(corpus[1:7])corpus <- tm_map(corpus,
> tolower)corpus <- tm_map(corpus, removePunctuation)corpus <- tm_map(corpus,
> removeNumbers)corpus <- tm_map(corpus, removeWords,
> stopwords("english"))inspect(doc.corpus[1:2])library(SnowballC)corpus <-
> tm_map(corpus, stemDocument)corpus <- tm_map(corpus,
> stripWhitespace)inspect(doc.corpus[1:8])TDM <-
>...
2010 Feb 16
0
tm package
...to me
as if after the following
reuters21578 <- Corpus(DirSource(corpusDir), readerControl =
list(reader = readReut21578XMLasPlain))
reuters21578 <- tm_map(reuters21578, stripWhitespace)
reuters21578 <- tm_map(reuters21578, tolower)
reuters21578 <- tm_map(reuters21578, removePunctuation)
reuters21578 <- tm_map(reuters21578, removeNumbers)
reuters21578.dtm <- DocumentTermMatrix(reuters21578)
that reuters21578.dtm does not include terms from the Heading (e.g. the Title).
I'm wondering if anyone can confirm this and if so, is there an option
to have the terms from...
2011 Apr 18
0
Help with cleaning a corpus
Hi!
I created a corpus and I started to clean through this piece of code:
txt <-tm_map(txt,removeWords, stopwords("spanish"))
txt <-tm_map(txt,stripWhitespace)
txt <-tm_map(txt,tolower)
txt <-tm_map(txt,removeNumbers)
txt <-tm_map(txt,removePunctuation)
But something happened: some of the documents in the corpus became empty,
this is a problem when i try to make a document term matrix with tfidf.
Is there any way to eliminate automatically a document if it becomes empty?
Or manually, how could I get the length of every document?
hope you...
2012 Feb 26
2
tm_map help
...chTwitter(hashTag, n=200)
df <- do.call("rbind", lapply(tweets, as.data.frame))
myCorpus <- Corpus(VectorSource(df$text))
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myStopwords <- c(stopwords('english'), "available", "via")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
dictCorpus <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument)
################ERROR HAPPENS...
2013 Sep 26
0
R hangs at NGramTokenizer
...pus <- tm_map(myCorpus, removeAmp)> removeWWW <- function(x) gsub("www[[:alnum:]]*", "", x)> myCorpus <- tm_map(myCorpus, removeWWW)> myCorpus <- tm_map(myCorpus, tolower)> myCorpus <- tm_map(myCorpus, removeNumbers)> myCorpus <- tm_map(myCorpus, removePunctuation)> myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))> myCorpus <- tm_map(myCorpus, removeWords, stopwords("SMART"))> myCorpus <- tm_map(myCorpus, stripWhitespace)> myDtm <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(1,Inf)))...
2014 Jun 18
2
No es un problema de tm tienes doc.corpus vacío
...readLines(TEXTFILE)length(inmortal)head(inmortal)tail(
> >> inmortal)library(tm)vec
> >> <- VectorSource(inmortal)corpus <-
> >> Corpus(vec)summary(corpus)inspect(corpus[1:7])corpus <-
> >> tm_map(corpus, tolower)corpus <- tm_map(corpus,
> >> removePunctuation)corpus <- tm_map(corpus, removeNumbers)corpus <-
> >> tm_map(corpus, removeWords,
> >>
> stopwords("english"))inspect(doc.corpus[1:2])library(SnowballC)corpus
> >> <- tm_map(corpus, stemDocument)corpus <- tm_map(corpus,
> >> stripWhitespa...
2011 Mar 24
2
Problem with Snowball & RWeka
Dear Forum,
when I try to use SnowballStemmer() I get the following error message:
"Could not initialize the GenericPropertiesCreator. This exception was
produced: java.lang.NullPointerException"
It seems to have something to do with either Snowball or RWeka, however I
can't figure out, what to do myself. If you could spend 5 minutes of your
valuable time, to help me or give me a
2014 Jun 18
3
No es un problema de tm tienes doc.corpus vacío
...ail(
> > > >> inmortal)library(tm)vec
> > > >> <- VectorSource(inmortal)corpus <-
> > > >> Corpus(vec)summary(corpus)inspect(corpus[1:7])corpus <-
> > > >> tm_map(corpus, tolower)corpus <- tm_map(corpus,
> > > >> removePunctuation)corpus <- tm_map(corpus, removeNumbers)corpus <-
> > > >> tm_map(corpus, removeWords,
> > > >>
> > > stopwords("english"))inspect(doc.corpus[1:2])library(SnowballC)corpus
> > > >> <- tm_map(corpus, stemDocument)corpus <- tm...
2012 Jan 13
4
Troubles with stemming (tm + Snowball packages) under MacOS
....
Here is the full source code (all the librairies are already loaded):
------
Sys.setenv(NOAWT=TRUE)
source <- ReutersSource("reuters-21578.xml", encoding="UTF-8")
reuters <- Corpus(source)
reuters <- tm_map(reuters, as.PlainTextDocument)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, tolower)
reuters <- tm_map(reuters, removeWords, stopwords("english"))
reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, stripWhitespace)
reuters <- tm_map(reuters, stemDocument)
------
Thank you for your help,
Julien
2017 Jun 12
0
count number of stop words in R
You can use regular expressions.
?regex and/or the stringr package are good places to start. Of
course, you have to define "stop words."
Cheers,
Bert
Bert Gunter
"The trouble with having an open mind is that people keep coming along
and sticking things into it."
-- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )
On Mon, Jun 12, 2017 at 5:40
2009 Jul 17
3
Ayuda con el paquete de text mining (TM)
Estimados, les escribo para consultar, lo siguiente:
Estoy haciendo un trabajo de text mining y necesito importar una serie de
textos para preprocesarlos, es decir eliminar los Stopwords, hacer stemming,
eliminar signos de puntuación etc. Esto último lo puedo realizar con los
datasets que trae la librería TM. Lo que no puedo lograr es importar texto
desde algún medio a pesar que existe funciones
2017 Jun 12
3
count number of stop words in R
Hi all,
Is there a way in R to count the number of stop words (English) of a string using tm package?
str="Mhm . Alright . There's um a young boy that's getting a cookie jar . And it he's uh in bad shape because uh the thing is falling over . And in the picture the mother is washing dishes and doesn't see it . And so is the the water is overflowing in the sink . And the