library("tm", lib.loc="~/Library/R/3.2/library")
library("wordcloud", lib.loc="~/Library/R/3.2/library")

# Absolute path of the plain-text list of BNF records to analyse.
filePath <- "/Volumes/Fortitudo/Documents/SyMoGIH/cours_lyon_3/cours_2015/annotation_notices_BNF/liste_notices_BNF_sans_url.txt"

# Read the file into a character vector, one element per line.
text <- readLines(con = filePath)

# Wrap the character vector in a tm corpus.
docs <- Corpus(VectorSource(x = text))

# Exploratory output: show the corpus, then list the readers and the
# transformations that tm makes available.
inspect(docs)
getReaders()
getTransformations()

# Convert the text to lower case. tolower() is a plain character function,
# so it is wrapped with content_transformer() before being given to tm_map().
docs <- tm_map(docs, FUN = content_transformer(tolower))

# Remove French stop words (the word list comes from stopwords("french")).
docs1 <- tm_map(docs, FUN = removeWords, stopwords("french"))

# Exploratory output on the filtered corpus: summary, third document,
# document names and the class of the object.
print(docs1)
docs1[[3]]
names(docs1)
class(docs1)
show(docs1)

# Remove punctuation.
docs2 <- tm_map(docs1, FUN = removePunctuation)

# Strip the extra whitespace.
docs3 <- tm_map(docs2, FUN = stripWhitespace)

# Stem the words with the French stemmer, then show the resulting corpus.
docs3.temp <- tm_map(docs3, FUN = stemDocument, language = "french")
inspect(docs3.temp)

# Build the term-document matrix of the stemmed corpus. Despite its name,
# `dtm` is created with TermDocumentMatrix(), and the code below sums its
# rows to get one total per term.
dtm <- TermDocumentMatrix(docs3.temp)
m <- as.matrix(dtm)

# Total count of each term over all documents, most frequent first.
v <- sort(x = rowSums(m), decreasing = TRUE)

# Frequency table: one row per term with its total count.
d <- data.frame(word = names(v), freq = v)

# Show the ten most frequent terms.
head(x = d, n = 10)

# Fix the random seed before drawing the word cloud.
set.seed(1234)

# Draw a word cloud of the term frequencies in `d`: every term is eligible
# (min.freq = 1), at most 200 words are drawn, words are not placed in random
# order, and the colours come from the 8-colour "Dark2" palette.
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

# No top-level return() here: return() is only valid inside a function, and
# calling it at the R top level (e.g. under Rscript) raises
# "no function to return from, jumping to top level", which stops the script
# before the concordance call below. The processed corpus stays available
# as `docs3.temp`.

# NOTE(review): concordance_file() is not defined in this part of the script
# and is not loaded explicitly above - confirm where it comes from.
# NOTE(review): '[Université]' is a regular-expression character class (any
# single one of those characters) if the pattern is treated as a regex, and
# the corpus has been lower-cased and stemmed by this point - confirm that
# this pattern and this input are the intended ones.
concordance_file(docs3.temp, '[Université]', encoding = 'UTF8', span = 5)
