# Linguistic applications, by Stefan Th. Gries.
#
# Demo for the 'gsubfn' package: builds simple word and word/tag frequency
# lists using strapply().  Attach gsubfn so strapply() is available when the
# script is run standalone.
library(gsubfn)

# Word frequency list from the package's lipsum.txt sample file ----

fn1 <- system.file("lipsum.txt", package = "gsubfn")
# One element per input line; lower-cased so "The" and "the" count together.
Lines1 <- tolower(scan(fn1, what = "char", sep = "\n"))
# Extract every word, tabulate, and show the six most frequent.
tail(sort(table(unlist(strapply(Lines1, "\\w+", perl = TRUE)))))

# Word frequency list from an SGML-annotated text file sampled from ----
# the British National Corpus

fn2 <- system.file("sample.txt", package = "gsubfn")
Lines2 <- scan(fn2, what = "char", sep = "\n")
# Keep only sentence lines, i.e. those starting with an <s n=...> tag.
tagged.corpus.sentences <- grep("^<s n=", Lines2, value = TRUE)
# just to see what it looks like
tagged.corpus.sentences[c(3, 8)]
# A word is the text between a closing ">" and the next "<".
words <- unlist(strapply(tagged.corpus.sentences, ">([^<]*)"))
words <- gsub(" $", "", words)  # strip a single trailing blank per word
tail(words, 25)

# Frequency list of words AND tags from the same file ----

# Each match is one tag together with the text that follows it.
word.tag.pairs <- unlist(strapply(tagged.corpus.sentences, "<[^<]*"))
# Keep only word elements (those beginning with a "<w " tag).
cleaned.word.tag.pairs <- grep("<w ", word.tag.pairs, value = TRUE)
cleaned.word.tag.pairs <- gsub(" +$", "", cleaned.word.tag.pairs)  # trim trailing blanks
tail(sort(table(cleaned.word.tag.pairs)))
tail(cleaned.word.tag.pairs)