Hi,
I am using the "tm" package for text mining of CMP patents.
I use the findAssocs() function to find the terms that are associated with
the words in my dictionary (technical names).
Now I want to use these associated terms to find out which documents
contain them.
For example, this is part of the result of
findAssocs(gram_dtm, dictionary_word, 0.5):
$apparatus
 apparatus polishing             glycerin          method high apparatus comprising
                0.66                 0.54                 0.54                 0.53
It shows that "apparatus" is associated with "apparatus polishing",
"glycerin", "method high" and "apparatus comprising".
How can I use this set of associated terms to do the following?
1. Find which documents they appear in.
2. Get the frequency of each term in each document.
3. Save those documents.
My code:
#Load the text mining package(s)
library("tm")
library("wordcloud")
library(ggplot2)
# Build the corpus ----
# Read the patent table, keeping text columns as character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
cluster1_df <- read.csv("cluster_1.csv", stringsAsFactors = FALSE)

# Keep columns 3 to 5, selected by position.
# NOTE(review): presumably these are the patent text fields -- confirm
# against the layout of cluster_1.csv.
cluster1_combined <- cluster1_df[, c(3, 4, 5)]

# Use VCorpus() explicitly. In newer tm releases Corpus() on a
# DataframeSource can return a SimpleCorpus, and DocumentTermMatrix() ignores
# custom tokenizer functions for a SimpleCorpus, which would silently drop the
# n-gram terms built later in this script.
# NOTE(review): newer tm releases also expect the data frame given to
# DataframeSource() to have `doc_id` and `text` columns -- confirm against
# the installed tm version.
corpus <- VCorpus(DataframeSource(cluster1_combined))
inspect(corpus)
# Pre-process and transform the corpus ----
# Stop list: the standard English list, the SMART list, and the patent
# boilerplate word "claim".
myStopwords <- c(stopwords("english"), stopwords("SMART"), "claim")

# Apply the cleaning steps to `corpus` one after another, in list order:
# lower-case, drop stop words, drop digits, drop punctuation, collapse
# whitespace, then stem.
corpus_tm <- Reduce(
  function(current, step) step(current),
  list(
    function(docs) tm_map(docs, content_transformer(tolower)),
    function(docs) tm_map(docs, removeWords, myStopwords),
    function(docs) tm_map(docs, removeNumbers),
    function(docs) tm_map(docs, removePunctuation),
    function(docs) tm_map(docs, stripWhitespace),
    function(docs) tm_map(docs, stemDocument)
  ),
  corpus
)
inspect(corpus_tm)
library(RWeka)

# Tokenizer that emits every n-gram of one to three words (despite its name,
# it is not restricted to bigrams).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 3))
}

# Build the TF-IDF weighted document-term matrix over the n-gram terms, then
# prune sparse terms using a sparsity threshold of 1 - 5 / (number of
# documents).
gram_dtm <- removeSparseTerms(
  DocumentTermMatrix(
    corpus_tm,
    control = list(
      tokenize = BigramTokenizer,
      weighting = function(m) {
        weightTfIdf(m, normalize = TRUE)
      }
    )
  ),
  1 - (5 / length(corpus_tm))
)
inspect(gram_dtm)
# Dictionary of (stemmed) technical terms to look up with findAssocs().
#
# Every multi-word entry is written on a single line with single spaces.
# A string literal that wraps across source lines contains a newline
# character and can therefore never equal a term of the document-term matrix.
# The wrapped entries repaired here are "back film", "pad condit",
# "pad structur", "polish head", "paramet", "singl layer",
# "weight percentag", "stylus profil" and "youngs modulus".
#
# "micro structur" was previously spelled "micro stuctur"; it now uses the
# same stem as "pad structur".
#
# NOTE(review): "comsum", "none-por" and "condition" look doubtful. The
#   corpus has punctuation removed and is stemmed, so a hyphenated or
#   unstemmed entry may never match a term. Confirm the intended stems.
# NOTE(review): "paramet" followed "process" across a line wrap; confirm that
#   a separate entry was intended and not "process paramet".
dictionary_word <- c(
  "neutral", "abras", "particl", "acid", "apparatus", "back film",
  "basic", "carrier", "chemic", "chromat", "confoc",
  "clean", "cmp", "compens type", "compress",
  "comsum", "control", "pressur", "dresser", "condition", "detect",
  "flow", "rate", "fractal", "groov", "hard", "improv type", "infrar",
  "laser", "layer", "measur", "micro structur", "monitor",
  "multi layer", "none-por", "nonwoven",
  "pad", "pad applic", "pad condit", "pad materi", "pad properti",
  "pad structur", "ph", "planet", "plate", "plat", "ratio",
  "polish head", "polish system", "polym", "polyurethan",
  "porous", "process", "paramet", "path", "time", "recoveri", "speed",
  "rough", "scatter", "semiconductor", "sensor", "signal", "singl layer",
  "slurri", "flow rate", "stirrer", "slurri suppli",
  "temperatur", "weight percentag", "wt", "storag cmp", "stylus profil",
  "substrat cmp", "thick", "transfer robot",
  "ultrason", "urethan", "wafer cassett", "wafer transfer",
  "white light interferomet", "youngs modulus"
)
# For each dictionary term, list the DTM terms whose correlation with it is
# at least 0.5.
onto_assocs <- findAssocs(
  x = gram_dtm,
  terms = dictionary_word,
  corlimit = 0.5
)
[[alternative HTML version deleted]]