# linguistic applications by Stefan Th. Gries

library(gsubfn)

# create a word frequency list from the lipsum.txt file shipped with gsubfn

fn1 <- system.file("lipsum.txt", package = "gsubfn")
Lines1 <- tolower(scan(fn1, what = "char", sep = "\n"))
tail(sort(table(unlist(strapply(Lines1, "\\w+", perl = TRUE)))))

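# As a quick check of the strapply() idiom used above, the same pipeline can be
# run on a small in-line vector (toy data, not taken from any package file):
# strapply() returns the regex matches per element, unlist() flattens them, and
# table()/sort() rank the word forms by frequency.
toy <- c("The cat sat on the mat.", "The cat slept.")
sort(table(unlist(strapply(tolower(toy), "\\w+", perl = TRUE))), decreasing = TRUE)
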
# frequency list of words from an SGML-annotated text file
# sampled from the British National Corpus

fn2 <- system.file("sample.txt", package = "gsubfn")
Lines2 <- scan(fn2, what = "char", sep = "\n")
tagged.corpus.sentences <- grep("^<s n=", Lines2, value = TRUE)
# just to see what it looks like
tagged.corpus.sentences[c(3, 8)]
words <- unlist(strapply(tagged.corpus.sentences, ">([^<]*)"))
words <- gsub(" $", "", words)
tail(words, 25)

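# To see what the ">([^<]*)" capture does, it can be applied to one hand-made
# SGML-style sentence (toy input, not taken from sample.txt): each captured
# group is whatever stands between a ">" and the next "<" (or the end of the
# string), i.e. the word, possibly with a trailing space, and an empty string
# wherever two tags abut.
toy.sent <- "<s n=1><w AT0>the <w NN1>dog <w VVD>barked<c PUN>."
unlist(strapply(toy.sent, ">([^<]*)"))
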
# frequency list of words AND tags from the same file

word.tag.pairs <- unlist(strapply(tagged.corpus.sentences, "<[^<]*"))
cleaned.word.tag.pairs <- grep("<w ", word.tag.pairs, value = TRUE)
cleaned.word.tag.pairs <- gsub(" +$", "", cleaned.word.tag.pairs)
# the most frequent word+tag pairs ...
tail(sort(table(cleaned.word.tag.pairs)))
# ... and the last few pairs in the sample
tail(cleaned.word.tag.pairs)
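# If the tag and the word are wanted as separate columns, one possible next step
# (a sketch, not part of the original demo) is to capture both pieces at once:
# for a pair such as "<w NN1>dog", the two groups below yield the tag and the
# word, and do.call(rbind, ...) stacks them into a two-column matrix.
tag.word <- strapply(head(cleaned.word.tag.pairs), "^<w ([^>]*)>(.*)$", c)
do.call(rbind, tag.word)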