Rの正規表現
- [R_tmパッケージの使い方]
_ tokenize
今ひとつ気に入らない処理なのだが.
alice.raw <- readLines("alice.txt") alice.raw3 <- strsplit(alice.raw, split = "[[:blank:]]|[[:punct:]]" , extended = TRUE, perl = TRUE)[[1]] alice.raw3[alice.raw3 != ""]
同じことだが
alice.raw3 <- unlist(strsplit(alice.raw, split = "[[:blank:]]|[[:punct:]]" , extended = TRUE, perl = TRUE)) alice.raw3[nchar(alice.raw3) > 0]
_ 以下,整理する予定
# gsub(alice.raw, "") cuttokens <- proto(pre = function(this) {
this$words <- list() }, fun = function(this, x) { if (is.null(words[[x]])) c(x, this) })
gsubfn("\\w+", cuttokens, "the dog and the cat are in the house")
library(proto) library(gsubfn)
pwords <- proto(pre = function(this) {
this$words <- list() }, fun = function(this, x) { if (is.null(words[[x]])) this$words[[x]] <- 0 this$words[[x]] <- words[[x]] + 1 paste0(x, "{", words[[x]], "}") })
gsubfn("\\w+", pwords, "the dog and the cat are in the house do")
pwords2 <- proto(pre = function(this) {
this$words <- list() }, fun = function(this, x) { if (is.null(words[[x]])) this$words[[x]] <- 0 this$words[[x]] <- words[[x]] + 1 list(x, words[[x]]) })
strapply("the dog and the cat are in the house", "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))
alice.raw[alice.raw== "no"]
strapply(alice.raw, "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))
pwords3 <- proto(pre = function(this) {
this$words <- list() }, fun = function(this, x) { if (is.null(words[[x]])) this$words[[x]] <- 0 this$words[[x]] <- words[[x]] + 1 list(x, words[[x]]) })
# inspect(alice.DC2)1? alice.raw alice.raw2 <- gsub(",","",alice.raw) alice.raw3 <- strsplit(alice.raw, split = ":blank:?|:punct:?" , extended = TRUE, perl = TRUE)1? alice.raw3[alice.raw3 != ""]
# strapply(alice.raw, "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))
# gsubfn("\\w+", pwords2, alice.raw)
########### </ tokenize する>
alice.raw <-
"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,'thought Alice `without pictures or conversation?'"
strsplit(alice.raw, split = "[[:blank:]]|[[:punct:]]" , extended = TRUE, perl = TRUE)[[1]]
#これだとマッチしたところで空ベクトルができてしまうよう.また gsubfn パッケージを使うと
strapply(alice.raw, "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))
# 正しく分割するが,頻度カウントでは部分一致を行ってしまう模様
##
install.packages("gsubfn") install.packages("proto") library("gsubfn") demo("gsubfn-gries")
### Examples
grep("[a-z]", letters)
txt <- c("arm","foot","lefroo", "bafoobar") if(any(i <- grep("foo",txt))) cat("'foo' appears at least once in\n\t",txt,"\n") i # 2 and 4 txt[i] jp.txt <- c("山本五十六","山下清","山田太郎") j <- grep("山本",jp.txt) jp.txt[j]
## Double all 'a' or 'b's; "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC")
## aかbがあれば,それをアンダーバーでつなげて二重化する
# \\1 は perl での $1に対応
gsub("([ab])", "\\$1_\\$1_", "abc and ABC", perl = TRUE)
# Warning message:
# perl=TRUE は UTF-8 ロケールに対してのみ完全実装されています
gsub("[山]", "石", "山田太郎")
txt <- c("The", "licenses", "for", "most", "software", "are", "designed", "to", "take", "away", "your", "freedom", "to", "share", "and", "change", "it.", "", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is", "intended", "to", "guarantee", "your", "freedom", "to", "share", "and", "change", "free", "software", "--", "to", "make", "sure", "the", "software", "is", "free", "for", "all", "its", "users") ( i <- grep("[gu]", txt) ) # indices stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )
## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdEe? ...
(ot <- sub("[b-e]",".", txt)) txt[ot != gsub("[b-e]",".", txt)] #- gsub does "global" substitution txt[gsub("g","#", txt) != gsub("g","#", txt, ignore.case = TRUE)] # the "G" words regexpr("en", txt) gregexpr("e", txt)
## trim trailing white space
str.test <- 'Now is the time ' sub(' +$', '', str.test) ## spaces only sub(' [[:space:]]+$', '', str.test) ## white space, POSIX-style sub('\\s+$', '', str, perl = TRUE) ## Perl-style white space
## capitalizing
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", "a test of capitalizing", perl=TRUE) gsub("\\b(\\w)", "\\U\\1", "a test of capitalizing", perl=TRUE)
########## gsubfn
## http://cran.r-project.org/src/contrib/Descriptions/gsubfn.html
library(gsubfn)
#
s <- "abc 10:20 def 30:40 50" gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s, backref = -2) gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s)
# gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s,
backref = 2)
# demo("gsubfn-si")
dat <- c("3.5G", "88P", "19") gsubfn("[MGP]$", ~c(M = "e6", G = "e9", P = "e12")[[x]], dat) p <- proto(fun = function(this, x) paste0(x, "{", count, "}"))
## count については ?paste0 を見よ
class(p) ls(p) with(p, fun) s <- c("the dog and the cat are in the house", "x y x") gsubfn("\\w+", p, s)
##### Rwiki での関数 proto の説明
x <- matrix(1:25,5) # 説明用のオブジェクト x incrb <- function(y) y[,1] <- y[,1]+1 incrb(x) # 引数をコピーして,コピーに操作を加える x # なのでx は変化しない
incrc <- function(x) {x[,1] <- x[,1]+1; return(x)} # 変更した x のコピーを返す x <- incrc(x) # x -> x のコピー -> x という順序で x が変更される x # 変更後
##############
prt <- proto(mat = matrix(1:25, 5)) ls(prt) prt$mat incr <- function(x) with(x, mat[,1] <- mat[,1] + 1) # テスト用関数、一見終了後は何も変化は無いように見える incr(prt)# proto オブジェクト p に関数 incr を実行 prt$mat# p の mat 成分が何時の間にか変更されている! cut.x <- as.character(cut(seq(1, 100), 5)) colMeans(strapply(cut.x, "[^][(),]+", as.numeric, simplify = TRUE)) fn$lapply(list(1:4, 1:5), ~LETTERS[x])
### 正規表現を使って区間をいじる ## 代表値の作成 library(gsubfn)
freq.dat <- read.table("data3/newspan3/aitobi3.span3.dat") freq.table <- table( factor(cut(freq.dat$V1, breaks= seq(0, max(freq.dat)+3, 3)))) freq.lab <- labels(table( factor(cut(freq.dat$V1, breaks= seq(0, max(freq.dat)+3, 3)))))[[1]] as.numeric(freq.lab) freq.matrix <- strapply(freq.lab, "[^][(),]+", as.numeric, simplify = TRUE) freq.repr <- numeric( ncol(freq.matrix)) for(i in 0:ncol(freq.matrix)){ # print(freq.matrix[2,i] - 1) freq.repr[i] <- floor((freq.matrix[2,i] - 1) / 3 ) } data.frame(repr = freq.repr, freq = as.numeric(freq.table))
out.file3 <- paste(out.dir, "newspan3/", text.text, data.ext, ".span3", sep = "") # 区間幅 3 freq.span <- as.numeric(dimnames(table(buntyo))[[1]]) write.table(table( factor(cut(buntyo$V1, breaks= seq(0, max(buntyo)+3, 3)))), file = out.file3, row.names = FALSE, col.names = FALSE, quote = FALSE) system(paste("./altmann3.pl", out.file3)) system(paste("rm", out.file3)) system(paste("qkc -m -s ", out.file3, ".dat", sep=""))
library(Rstem) getStemLanguages() gesetz <- readLines("p2007/bsj/vorDemGesezt.txt") length(gesetz) gesetz <- fifo("p2007/bsj/vorDemGesezt.txt") wordStem(gesetz[5], language = "german") gesetz <- file(p2007/bsj/vorDemGesezt.txt") close(gesetz) readChar(gesetz[5]) wordStem("this was a pen", lang = "english") wordStem(c("these", "are", "books"), lang = "english") wordStem(c("He", "liked", "apples"), lang = "english") wordStem(c("win", "winning", 'winner')) wordStem(c("Er", "liest", "das", "Buch"), language = "german")
## test fixed-length strings
zz <- file("testchar", "wb") x <- c("a", "this will be truncated", "abc") nc <- c(3, 10, 3) zz <- file("testchar", "rb") readChar(gesetz, nc) readChar(zz, nchar(x)+3) # need to read the terminator explicitly close(zz) unlink("testchar")
# system.file() gsub()
alice <-
"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do"
z <- unlist(strsplit(alice, split =" ")) length(z) strsplit(alice, split = c(" ",",")) strsplit(alice, split = (" , ")) con <- file("test.txt") # open(con) for(i in 1:4) { x <- readLines(con, n=1) if(length(x)> 0){ print(x) } } close(con) # con <- file("test.txt") # con <- file("test.txt") open(con) z <- 1 while(z){ x <- readLines(con, n=1) if(length(x)> 0){ print(x) }else{ z <- 0 break; } } close(con)
con <- file("test.txt") open(con) while(1){ x <- readLines(con, n=1) if(length(x)> 0){ print(x) }else{ break; } } close(con)
### 読み込むたびにスペースで区切る
con <- file("test.txt") open(con) while(1){ x <- readLines(con, n=1) if(length(x)> 0){ z <- unlist(strsplit(x, " ")) for(j in 1:length(z)){ print(z[j]) } }else{ break; } } close(con)
### 読み込むたびに stemming する
con <- file("test.txt") open(con) while(1){ x <- readLines(con, n=1) if(length(x) == 0 || x == ""){ break; } else{ x <- gsub("\\.|\\,|\\?|\\!", "", x) if(length(x)> 0 ){ z <- unlist(strsplit(x, " ")) for(j in 1:length(z)){ cat(wordStem(z[j], lang = "english"), "\n") #print(z[j]) } } } } close(con)
s <-
"These books are good. Our dogs, are, pretty. Oh! my gods!?"
s <- gsub("\\.|\\,|\\?|\\!", "", s) strsplit(s, " ")
# if(as.logical(s) != NA) print("NA")
s <- "" length(s) if(s == "") print ("empty")
x <- rpois(100, lam = 5)# いい加減な乱数で table(cut(x, breaks = seq(0, max(x), 3)))# 頻度表を作る table(cut(x, breaks = seq(0, max(x), 3), labels = F)) prop.table(cut(x, breaks = seq(0, max(x), 3)))# as.numeric(noquote(labels(table(cut(x, breaks = seq(0, max(x), 3)))))[[1]]) x <- c(1,1,1,1,2, 5,5,5, 6,6) table(cut(x, breaks = 0:max(x))) table(cut(x, breaks = 0:max(x), lab = F))# 頻度表を作る as.numeric(labels(table(cut(x, breaks = 0:max(x), lab = F)))[[1]]) table(cut(x, breaks = 0:max(x), lab = 0:max(x))) # 頻度表を作る