Regular Expressions in R
The material below still needs to be organized.
#contents

-[R_tmパッケージの使い方]

* tokenize [#m295ecce]
I am not entirely happy with this way of doing it, but:

 alice.raw3 <- strsplit(alice.raw,
   split = "[[:blank:]]|[[:punct:]]",
   perl = TRUE)[[1]]   # the old 'extended' argument has been dropped from strsplit()
 alice.raw3[alice.raw3 != ""]   # drop the empty strings left at the match positions

The same thing, written differently:
 alice.raw3 <- unlist(strsplit(alice.raw,
   split = "[[:blank:]]|[[:punct:]]",
   perl = TRUE))   # 'extended' is likewise dropped here
 alice.raw3[nchar(alice.raw3) > 0]
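A perhaps cleaner alternative (a sketch, using the same alice.raw): splitting on runs of non-word characters avoids most of the empty strings in one step.

 alice.raw3 <- strsplit(alice.raw, split = "\\W+", perl = TRUE)[[1]]
 alice.raw3[nchar(alice.raw3) > 0]   # a leading separator can still leave one empty string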


 

* To be organized [#v6f02d60]

 # gsub(alice.raw, "")   # note: gsub()'s argument order is (pattern, replacement, x)
 cuttokens <- proto(pre = function(this) {
   this$words <- list()
   }, fun = function(this, x) {
     # a guess at the intended behaviour: keep the first occurrence of each word, drop repeats
     if (is.null(words[[x]])) {
       this$words[[x]] <- TRUE
       x
     } else ""
   })



 gsubfn("\\w+", cuttokens, "the dog and the cat are in the house")
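A working tokenization along the same lines (a sketch): strapply() from gsubfn, loaded just below, simply returns the matches.

 strapply("the dog and the cat are in the house", "\\w+", c,
    simplify = TRUE)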



 library(proto)
 library(gsubfn)


 pwords <- proto(pre = function(this) {
   this$words <- list()                    # per-word counters, reset before each call
   }, fun = function(this, x) {
     if (is.null(words[[x]]))
       this$words[[x]] <- 0
     this$words[[x]] <- words[[x]] + 1
     paste0(x, "{", words[[x]], "}")       # append the running count to each word
   })


 gsubfn("\\w+",  pwords, "the dog and the cat are in the house do")


 pwords2 <- proto(pre = function(this) {
   this$words <- list()
   }, fun = function(this, x) {
     if (is.null(words[[x]]))
       this$words[[x]] <- 0
     this$words[[x]] <- words[[x]] + 1
     list(x, words[[x]])                   # return the word and its running count as a pair
   })


 strapply("the dog and the cat are in the house", "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))


 alice.raw[alice.raw == "no"]
 strapply(alice.raw, "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))

 pwords3 <- proto(pre = function(this) {   # identical to pwords2 above
 this$words <- list()
 }, fun = function(this, x) {
 if (is.null(words[[x]]))
 this$words[[x]] <- 0
 this$words[[x]] <- words[[x]] + 1
 list(x, words[[x]])
 })


 #  inspect(alice.DC2)[[1]]
 alice.raw
 alice.raw2 <- gsub(",", "", alice.raw)
 alice.raw3 <- strsplit(alice.raw, split = "[[:blank:]]|[[:punct:]]",
    perl = TRUE)[[1]]    # 'extended' is no longer an argument of strsplit()
 alice.raw3[alice.raw3 != ""]

 #  strapply(alice.raw, "\\w+", pwords2, combine = list, simplify = x ~ do.call(rbind, x))
 
 # gsubfn("\\w+", pwords2, alice.raw)

########### </tokenize>

 alice.raw <-
  "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,'thought Alice `without pictures or conversation?'"
 strsplit(alice.raw, split = "[[:blank:]]|[[:punct:]]",
    perl = TRUE)[[1]]    # 'extended' is no longer an argument of strsplit()
#This seems to leave empty strings at the positions where the pattern matched. Using the ''gsubfn'' package instead:

 strapply(alice.raw, "\\w+", pwords2, combine = list,
   simplify = x ~ do.call(rbind, x))
# Splits correctly, but the frequency count appears to match word names only partially (older versions of R allowed partial matching with [[ and a character index)
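For an exact-match frequency count, the words can also be extracted with regmatches() and tabulated directly (a sketch that needs no gsubfn, using the alice.raw defined just above):

 alice.words <- regmatches(alice.raw, gregexpr("\\w+", alice.raw, perl = TRUE))[[1]]
 sort(table(alice.words), decreasing = TRUE)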
 
##
 install.packages("gsubfn")
 install.packages("proto")
 library("gsubfn")
 demo("gsubfn-gries")


### Examples

 grep("[a-z]", letters)

 txt <- c("arm","foot","lefroo", "bafoobar")
 if(any(i <- grep("foo",txt)))
   cat("'foo' appears at least once in\n\t",txt,"\n")
 i # 2 and 4
 txt[i]
 
 jp.txt <- c("山本五十六","山下清","山田太郎")
 j <- grep("山本",jp.txt)
 jp.txt[j]

## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
 gsub("([ab])", "\\1_\\1_", "abc and ABC")
## if an 'a' or 'b' occurs, double it with an underscore in between
# \\1 corresponds to $1 in Perl
 gsub("([ab])", "\\$1_\\$1_", "abc and ABC", perl = TRUE)
 # Warning message:
 # perl = TRUE is only fully implemented in UTF-8 locales
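Another small backreference example (a sketch): captured groups can also be reordered in the replacement.

 sub("(\\w+) (\\w+)", "\\2 \\1", "hello world")   # "world hello"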

 gsub("[山]", "石", "山田太郎")


 txt <- c("The", "licenses", "for", "most", "software", "are",
  "designed", "to", "take", "away", "your", "freedom",
  "to", "share", "and", "change", "it.",
   "", "By", "contrast,", "the", "GNU", "General", "Public",
   "License",
   "is", "intended", "to", "guarantee", "your", "freedom",
   "to",
   "share", "and", "change", "free", "software", "--",
   "to", "make", "sure", "the", "software", "is",
   "free", "for", "all", "its", "users")
 ( i <- grep("[gu]", txt) ) # indices
 stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )

## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdEe ...
 (ot <- sub("[b-e]",".", txt))
 txt[ot != gsub("[b-e]",".", txt)]
   #- gsub does "global" substitution
 
 txt[gsub("g","#", txt) !=
    gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
 
 regexpr("en", txt)
 
 gregexpr("e", txt)

## trim trailing white space
 str.test <- 'Now is the time      '
 sub(' +$', '', str.test)  ## spaces only
 sub(' [[:space:]]+$', '', str.test) ## white space, POSIX-style
 sub('\\s+$', '', str.test, perl = TRUE) ## Perl-style white space
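In more recent R (3.2.0 or later), the built-in trimws() does the same job without a hand-written pattern:

 trimws(str.test, which = "right")   ## trailing white space only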

## capitalizing
 gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", "a test of capitalizing",
    perl=TRUE)
 gsub("\\b(\\w)", "\\U\\1", "a test of capitalizing", perl=TRUE)

########## gsubfn
## http://cran.r-project.org/src/contrib/Descriptions/gsubfn.html
 library(gsubfn)
#
 s <- "abc 10:20 def 30:40 50"
 gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s,
    backref = -2)
 gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s)
 # gsubfn("([0-9]+):([0-9]+)", ~as.numeric(x) + as.numeric(y), s,
 #    backref = 2)

# demo("gsubfn-si")

 dat <- c("3.5G", "88P", "19")
 gsubfn("[MGP]$", ~c(M = "e6", G = "e9", P = "e12")[[x]], dat)
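To get actual numbers back, the character result can simply be wrapped in as.numeric() (a small follow-up sketch):

 as.numeric(gsubfn("[MGP]$", ~c(M = "e6", G = "e9", P = "e12")[[x]], dat))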
 
 p <- proto(fun = function(this, x) paste0(x, "{", count, "}"))
## count is a running match counter that gsubfn maintains for proto replacement objects; see ?gsubfn
 class(p)
 
 ls(p)
 
 with(p, fun)
 
 s <- c("the dog and the cat are in the house", "x y x")
 gsubfn("\\w+", p, s)
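The same numbering can also be written with an ordinary closure instead of a proto object (a sketch; this counter simply runs over every match in the call):

 counter <- local({
   i <- 0
   function(x) { i <<- i + 1; paste0(x, "{", i, "}") }
 })
 gsubfn("\\w+", counter, s)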

##### Explanation of the proto function from RWiki
 x <- matrix(1:25,5) # an object for illustration
 x
 incrb <- function(y) y[,1] <- y[,1]+1
 incrb(x) # the argument is copied and only the copy is modified
 x # so x itself does not change


 incrc <- function(x) {x[,1] <- x[,1]+1; return(x)}
 # returns a modified copy of x
 x <- incrc(x)
 # x is updated in the order x -> copy of x -> x
 x               # after the change

############## 
 prt <- proto(mat = matrix(1:25, 5))
 ls(prt)
 prt$mat
 incr <- function(x) with(x, mat[,1] <- mat[,1] + 1)
 # a test function; at first glance nothing seems to change after it returns
 incr(prt)# run incr on the proto object prt
 prt$mat# the mat component of prt has quietly been changed!
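The same by-reference behaviour can be shown with a plain environment, which is what a proto object is underneath (a small sketch):

 e <- new.env()
 e$mat <- matrix(1:25, 5)
 incr.env <- function(x) with(x, mat[,1] <- mat[,1] + 1)
 incr.env(e)   # environments are passed by reference, so e is modified in place
 e$mat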
 
 cut.x <- as.character(cut(seq(1, 100), 5))
 colMeans(strapply(cut.x, "[^][(),]+", as.numeric,
  simplify = TRUE))
 
 fn$lapply(list(1:4, 1:5), ~LETTERS[x])



### Manipulating intervals with regular expressions ## building representative values
 library(gsubfn)

 freq.dat <- read.table("data3/newspan3/aitobi3.span3.dat")
 
 freq.table <- table( factor(cut(freq.dat$V1, breaks= seq(0,
    max(freq.dat)+3, 3))))
 freq.lab <- labels(table( factor(cut(freq.dat$V1,
    breaks= seq(0, max(freq.dat)+3, 3)))))[[1]]
 as.numeric(freq.lab)  # yields NAs: the labels look like "(0,3]", hence the strapply() step below
 
 freq.matrix <- strapply(freq.lab, "[^][(),]+", as.numeric,
    simplify = TRUE)
 
 freq.repr <- numeric( ncol(freq.matrix))
 for(i in 1:ncol(freq.matrix)){   # 1:..., not 0:...; R indexing starts at 1
 #  print(freq.matrix[2,i] - 1)
  freq.repr[i] <- floor((freq.matrix[2,i] - 1) / 3  )
 }
 
 data.frame(repr = freq.repr, freq =  as.numeric(freq.table))
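The same idea as a self-contained sketch (the file read into freq.dat above is not available here, so simulated counts stand in for it):

 set.seed(1)
 sim <- rpois(200, lambda = 5)                    # stand-in for freq.dat$V1
 sim.table <- table(cut(sim, breaks = seq(0, max(sim) + 3, 3)))
 sim.lab <- names(sim.table)                      # labels such as "(0,3]"
 sim.matrix <- strapply(sim.lab, "[^][(),]+", as.numeric, simplify = TRUE)
 sim.repr <- floor((sim.matrix[2, ] - 1) / 3)     # representative value per interval
 data.frame(repr = sim.repr, freq = as.numeric(sim.table))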


 out.file3 <- paste(out.dir, "newspan3/", text.text,
    data.ext, ".span3", sep = "") # interval width 3
 freq.span <-  as.numeric(dimnames(table(buntyo))[[1]])
 write.table(table( factor(cut(buntyo$V1, breaks= seq(0,
    max(buntyo)+3, 3)))),
   file = out.file3, row.names = FALSE, col.names = FALSE,
      quote = FALSE)
 system(paste("./altmann3.pl", out.file3))
 system(paste("rm",  out.file3))
 system(paste("qkc -m -s ", out.file3, ".dat", sep=""))


 library(Rstem)
 getStemLanguages()
 
 gesetz <- readLines("p2007/bsj/vorDemGesezt.txt")
 length(gesetz)
 
 # gesetz <- fifo("p2007/bsj/vorDemGesezt.txt")   # a fifo connection cannot be indexed like a character vector, so keep the readLines() result
 
 wordStem(gesetz[5], language = "german")   # gesetz[5] is the 5th line read with readLines() above
 
 gesetz <- file("p2007/bsj/vorDemGesezt.txt", "r")   # the opening quote was missing
 readChar(gesetz, nchars = 80)   # readChar() needs the number of characters to read
 close(gesetz)
 
 wordStem("this was a pen", lang = "english")
 wordStem(c("these", "are", "books"), lang = "english")
 wordStem(c("He", "liked", "apples"), lang = "english")
 wordStem(c("win", "winning", 'winner'))
 wordStem(c("Er", "liest", "das", "Buch"),
    language = "german")

## test fixed-length strings
 zz <- file("testchar", "wb")
 x <- c("a", "this will be truncated", "abc")
 nc <- c(3, 10, 3)
 writeChar(x, zz, nc, eos = NULL)   # fixed-width records, no terminator
 writeChar(x, zz, eos = "\r\n")     # the same strings again, "\r\n"-terminated
 close(zz)
 
 zz <- file("testchar", "rb")
 readChar(zz, nc)
 readChar(zz, nchar(x)+3)
 # need to read the terminator explicitly
 
 close(zz)
 unlink("testchar")

 # system.file()
 # gsub()


 alice <-
  "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do"
 z <- unlist(strsplit(alice, split =" "))
 length(z)
 
 strsplit(alice, split = c(" ",","))   # split is recycled over the elements of x, so only " " is used here
 strsplit(alice, split = (" , "))      # splits only where the literal " , " occurs
 con <- file("test.txt")
 #
 open(con)
 for(i in 1:4) {
  x <- readLines(con, n=1)
  if(length(x)> 0){
    print(x)
  }
 }
 close(con)
 #
 con <- file("test.txt")
 #
 
 con <- file("test.txt")
 open(con)
 
 z <- 1
 while(z){
  x <- readLines(con, n=1)
  if(length(x)> 0){
    print(x)
  }else{
    z <- 0
    break;
  }
 }
 
 close(con)

 con <- file("test.txt")
 open(con)
 while(1){
  x <- readLines(con, n=1)
  if(length(x)> 0){
    print(x)
  }else{
    break;
  }
 }
 close(con)

### split on spaces each time a line is read
 con <- file("test.txt")
 open(con)
 while(1){
  x <- readLines(con, n=1)
  if(length(x)> 0){
    z <- unlist(strsplit(x, " "))
    for(j in 1:length(z)){
      print(z[j])
    }
  }else{
    break;
  }
 }
 close(con)
### apply stemming each time a line is read
 con <- file("test.txt")
 open(con)
 while(1){
  x <- readLines(con, n=1)
  if(length(x) == 0 || x == ""){
    break;
  }
  else{
    x <-  gsub("\\.|\\,|\\?|\\!", "",  x)
    if(length(x)> 0 ){
      z <- unlist(strsplit(x, " "))
      for(j in 1:length(z)){
        cat(wordStem(z[j], lang = "english"), "\n")
        #print(z[j])
      }
    }
  }
 }
 close(con)
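The same result without an explicit read loop (a sketch, assuming test.txt is small enough to read in one go):

 test.lines <- readLines("test.txt")
 test.lines <- gsub("[.,?!]", "", test.lines)          # strip the same punctuation as above
 test.tokens <- unlist(strsplit(test.lines, " "))
 test.tokens <- test.tokens[nchar(test.tokens) > 0]    # drop empty tokens
 wordStem(test.tokens, language = "english")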


 s <-
  "These books are good. Our dogs, are, pretty. Oh! my gods!?"
 
 s <- gsub("\\.|\\,|\\?|\\!", "",  s)
 strsplit(s, " ")
# if(as.logical(s) != NA) print("NA")
 s <- ""
 length(s)
 if(s == "") print ("empty")


 x <- rpois(100, lambda = 5)# some quick Poisson random numbers
 table(cut(x, breaks = seq(0, max(x), 3)))# build a frequency table
 table(cut(x, breaks = seq(0, max(x), 3), labels = F))
 prop.table(table(cut(x, breaks = seq(0, max(x), 3))))# relative frequencies (prop.table needs a table, not a factor)
 as.numeric(noquote(labels(table(cut(x,
    breaks = seq(0, max(x), 3)))))[[1]])  # yields NAs: labels like "(0,3]" are not plain numbers
 
 x <- c(1,1,1,1,2, 5,5,5, 6,6)
 table(cut(x, breaks = 0:max(x)))
 table(cut(x, breaks = 0:max(x), lab = F))# build a frequency table
 as.numeric(labels(table(cut(x,
    breaks = 0:max(x), lab = F)))[[1]])
 table(cut(x, breaks = 0:max(x), lab = 1:max(x)))
 # build a frequency table (labels must match the number of intervals, hence 1:max(x))