# Package-name marker read by R's loading machinery for the RMeCab package.
.packageName <- "RMeCab"
# 2008 03 28 # the third arg omitted
# Count N-grams in a file via the compiled MeCab routine.
#   filename: path to the text file to analyze.
#   type:     unit of the N-gram; must be 0, 1, or 2.
#   N:        N-gram size (default bigram).
#   pos:      POS filter(s), only honored when type == 1.
#   posN:     number of POS filters; recomputed from `pos` when type == 1.
Ngram <-
  function(filename, type = 0, N = 2, pos = "", posN = 0){
    if (!file.exists(filename)) {
      stop("file not found")
    }
    if (type != 0 && type != 1 && type != 2) {
      stop("type must be 0 or 1 or 2")
    }
    if (type == 1) {
      # BUG FIX: the original used `else if (length(pos) > 2)`, so a
      # two-element pos vector never updated posN (it stayed at the
      # default 0).  Any non-empty pos now sets posN = length(pos).
      if (length(pos) == 1 && pos == "") {
        posN <- 0
      } else {
        posN <- length(pos)
      }
    }
    .Call("Ngram", as.character(filename), as.integer(type), as.integer(N),
          as.character(pos), as.integer(posN), PACKAGE = "RMeCab")
  }
# 2008 06 14
# Like Ngram(), but the compiled routine returns the result as a data frame
# with one column per gram element plus a Freq column.
#   filename: path to the text file to analyze.
#   type:     unit of the N-gram; must be 0, 1, or 2.
#   N:        N-gram size (default bigram).
#   pos:      POS filter(s), only honored when type == 1.
#   posN:     number of POS filters; recomputed from `pos` when type == 1.
NgramDF <-
  function(filename, type = 0, N = 2, pos = "", posN = 0){
    if (!file.exists(filename)) {
      stop("file not found")
    }
    if (type != 0 && type != 1 && type != 2) {
      stop("type must be 0 or 1 or 2")
    }
    if (type == 1) {
      # BUG FIX: same off-by-one as Ngram() — `length(pos) > 2` ignored
      # two-element pos vectors, leaving posN at 0.
      if (length(pos) == 1 && pos == "") {
        posN <- 0
      } else {
        posN <- length(pos)
      }
    }
    .Call("NgramDF", as.character(filename), as.integer(type), as.integer(N),
          as.character(pos), as.integer(posN), PACKAGE = "RMeCab")
  }
# 2008 03 28 # the third arg omitted
# Morphologically analyze a single string with MeCab.
#   str:    character scalar to parse; must be non-empty.
#   mypref: integer flag forwarded to the C routine
#           (presumably selects surface vs. base form — verify in C source).
RMeCabC <-
function(str, mypref = 0){
  if (nchar(str) == 0) {
    stop("first argument must be specified")
  }
  .Call("RMeCabC", as.character(str), as.integer(mypref), PACKAGE = "RMeCab")
}
# Run MeCab over every cell of column `coln` of a data frame.
#   dataf:  a data frame whose target column holds factor-coded text.
#   coln:   numeric index of the column to analyze.
#   mypref: forwarded to RMeCabC().
# Returns a list with one element per row: the unlisted RMeCabC() result,
# or NA for cells that are not factors, are NA, or are empty strings.
RMeCabDF <-
  function(dataf, coln, mypref = 0){
    if (!is.data.frame(dataf)) {
      stop("the first argument must be a data frame!")
    }
    if (!is.numeric(coln) || coln > ncol(dataf)) {
      stop("the second argument must be integer")
    }
    n <- length(dataf[, coln])
    # BUG FIX: the original `list(length(...))` built a one-element list
    # containing n, not a preallocated list of length n; and `1:n` would
    # iterate c(1, 0) for a zero-row data frame.
    kekka <- vector("list", n)
    for (i in seq_len(n)) {
      cell <- dataf[i, coln]
      if (!is.factor(cell) || is.na(cell) || cell == "") {
        kekka[[i]] <- NA
      } else {
        kekka[[i]] <- unlist(RMeCabC(cell, mypref)) # the third arg omitted
      }
    }
    return(kekka)
  }
# 
# Analyze a whole document file with MeCab via the compiled routine.
#   filename: path to the file to analyze.
#   mypref:   flag forwarded unchanged to the C side.
RMeCabDoc <-
function(filename, mypref = 1){
  if (!file.exists(filename)) {
    stop("file not found")
  }
  .Call("RMeCabDoc", as.character(filename), as.numeric(mypref),
        PACKAGE = "RMeCab")
}


# Morpheme frequency table for a file, computed by the compiled routine.
#   filename: path to the file to analyze.
RMeCabFreq <- function(filename){
  if (!file.exists(filename)) {
    stop("file not found")
  }
  .Call("RMeCabFreq", as.character(filename), PACKAGE = "RMeCab")
}
# Term-frequency vector for one file, restricted to the given POS tags.
#   filename: path to the file to analyze.
#   pos:      character vector of POS tags to keep (required, non-empty).
#   posN:     length of `pos`; always overwritten to stay consistent.
#   minFreq:  minimum frequency threshold forwarded to the C routine.
RMeCabMx <-
function(filename, pos, posN, minFreq = 1){
  if (!file.exists(filename)) {
    stop("file not found")
  }
  if (length(pos) < 1) {
    stop("second argument must be specified.")
  }
  # posN must equal length(pos); enforcing it unconditionally is
  # equivalent to the original's conditional reassignment.
  posN <- length(pos)
  .Call("RMeCabMx", as.character(filename), pos, as.numeric(posN),
        as.numeric(minFreq), PACKAGE = "RMeCab")
}

# Raw MeCab analysis of a file, one list element per morpheme line.
#   filename: path to the file to analyze.
RMeCabText <-
function(filename){
  if (!file.exists(filename)) {
    stop("file not found")
  }
  .Call("RMeCabText", as.character(filename), PACKAGE = "RMeCab")
}


## @Book{barnbrook96:_languag_comput,
##   author =		 {Geoff Barnbrook},
##   title = 		 {Language and Computers},
##   publisher = 	 {Edinburgh},
##   year = 		 1996
## }

# Add T-scores and MI-scores to the output of collocate().
#   kekka: data frame returned by collocate(); must contain Term, Total and
#          Span columns plus the "[[MORPHEMS]]" and "[[TOKENS]]" marker rows.
#   node:  the node morpheme the collocation table was built around.
#   span:  window size (per side) used when collocate() was run; required.
# Returns `kekka` with two extra columns, T and MI; the node row and the
# two marker rows get NA in both.  Formulas follow Barnbrook (1996),
# "Language and Computers", pp. 93 and 97.
collScores <-
  function(kekka, node, span = 0){
  if (!is.data.frame(kekka)) {
    stop("first argument must be result of collocate()")
  }
  num2 <- which(kekka$Term == "[[MORPHEMS]]")   # marker row: morpheme types
  if (length(num2) != 1) {
    # BUG FIX: the original tested `(num2 <- which(...)) < 1`, which is
    # logical(0) when the marker row is missing and makes `if` itself
    # fail instead of raising the intended error message.
    stop("first argument must be result of collocate()")
  }
  if (nchar(node) < 1) {
    stop("second argument (node)  must be a node morpheme")
  }
  if (span == 0) {
    stop("third argument (span)  must be specified")
  }

  num1 <- which(kekka$Term == node)             # row of the node itself
  if (length(num1) != 1) {
    stop("node not found in first argument")
  }
  num3 <- which(kekka$Term == "[[TOKENS]]")     # marker row: total tokens
  if (length(num3) != 1) {
    stop("first argument must be result of collocate()")
  }

  # Token slots inside the window: node frequency * span left * span right.
  spanTokens <- span * 2 * kekka[num1, "Total"]
  # Expected in-window co-occurrence count of each term.
  expected <- kekka[, "Total"] / kekka[num3, "Total"] * spanTokens

  ##### T-score: (observed - expected) / sqrt(observed)
  tscore <- (kekka[, "Span"] - expected) / sqrt(kekka[, "Span"])
  tscore[c(num1, num2, num3)] <- NA
  kekka$T <- tscore

  ##### MI-score: log2(observed / expected)
  mutual <- log2(kekka[, "Span"] / expected)
  mutual[c(num1, num2, num3)] <- NA
  kekka$MI <- mutual

  return(kekka)
}
# Build a collocation table for `node` within +/- `span` tokens, via the
# compiled routine.
#   filename: path to the file to analyze.
#   node:     the node morpheme (required, non-empty).
#   span:     window size on each side of the node.
collocate <-
function(filename, node, span = 3){
  if (!file.exists(filename)) {
    stop("file not found")
  }
  if (nchar(node) < 1) {
    # BUG FIX: corrected typo "argumet" in the error message.
    stop("second argument must be specified")
  }
  .Call("collocate", as.character(filename), as.character(node),
        as.numeric(span), PACKAGE = "RMeCab")
}


# Term-frequency data frame for a single document.
#   filename: path to the document.
#   pos:      POS tags passed through to RMeCabMx().
#   minFreq:  minimum frequency threshold.
# Returns a data frame (docs, terms, Freq) or NULL when nothing was counted.
docVector <-
  function (filename, pos, minFreq = 1) {
    if (!file.exists(filename)) {
      stop("file not found")
    }
    freqs <- RMeCabMx(filename, pos, length(pos), minFreq)
    if (length(freqs) < 1) {
      return(NULL)
    }
    data.frame(docs = basename(filename),
               terms = names(freqs),
               Freq = as.vector(freqs),
               row.names = NULL)
  }



# Build a term-document matrix for every file in `mydir`.
#   mydir:   directory whose files are each analyzed with docVector().
#   pos:     POS tags to keep.
#   minFreq: minimum term frequency per document.
#   weight:  weighting spec, components joined by "*", e.g. "tf*idf*norm".
#            Recognized components: no, tf, tf2, tf3, idf, idf2, idf3,
#            idf4, norm.  "no" keeps the two information rows and stops
#            further weighting.
# Returns the (possibly weighted) matrix invisibly.
docMatrix <-
  function( mydir,  pos = c("名詞", "動詞","形容詞"), minFreq = 1, weight = "no" ) {

  dummy <- lapply(dir(mydir, full.names = TRUE), docVector, pos, minFreq)

  if (length(dummy) == 0) {
    stop("no doc-matrix returned.")
  } else if (length(dummy) == 1) {
    # Single document: return the per-file result as-is.
    return(dummy)
  }

  # Cross-tabulate terms (rows) against documents (columns).
  dtm <- t(xtabs(Freq ~ ., data = do.call("rbind", dummy)))

  # Apply each weighting component in the order given.
  argW <- unlist(strsplit(weight, "*", fixed = TRUE))
  for (i in seq_along(argW)) {
    if (argW[i] == "no") {
      print("Term Document Matrix includes 2 information rows!")
      break
    } else if (argW[i] == "tf") {
      dtm <- localTF(dtm)
    } else if (argW[i] == "tf2") {
      dtm <- localLogTF(dtm)
    } else if (argW[i] == "tf3") {
      dtm <- localBin(dtm)
    } else if (argW[i] == "idf") {
      dtm <- dtm * globalIDF(dtm)
    } else if (argW[i] == "idf2") {
      dtm <- dtm * globalIDF2(dtm)
    } else if (argW[i] == "idf3") {
      # BUG FIX: "idf3" previously dispatched to globalIDF2(); route it
      # to the probabilistic IDF it names.
      dtm <- dtm * globalIDF3(dtm)
    } else if (argW[i] == "idf4") {
      dtm <- dtm * globalEntropy(dtm)
    } else if (argW[i] == "norm") {
      if (i == 1) {
        # No local weight ran yet, so the info rows are still present.
        dtm <- removeInfo(dtm)
      }
      dtm <- t(t(dtm) * mynorm(dtm))
    }
  }

  if (any(is.nan(dtm))) {
    print("Warning! Term document matrix includes NaN!")
  }
  invisible(dtm)
}

# S3 print method for "docMatrix" objects: reports only the element count.
print.docMatrix <-
  function(x, ...)
  {
    n <- length(x)
    print(n)
  }
# Run Ngram() on one file and tag each row with the file's base name.
#   filename, type, N, pos: forwarded to Ngram().
# Returns the Ngram() result with an added Text column, or NULL when the
# file produced no N-grams.
makeNgram <-
  function (filename, type = 0, N = 2, pos = "") {
    if (!file.exists(filename)) {
      stop("file not found")
    }

    dummy <- Ngram(filename, type, N, pos)
    # BUG FIX: the emptiness check must run BEFORE adding the Text column;
    # the original assigned dummy$Text first, which turned a NULL result
    # into a non-empty list so the NULL branch could never trigger.
    if (length(dummy) < 1) {
      return(NULL)
    }
    dummy$Text <- rep(basename(filename), length(dummy$Freq))
    dummy
  }



# N-gram frequency table across every file in `mydir`.
#   mydir: directory of documents; type/N/pos are forwarded to makeNgram().
# Returns the cross-tabulated frequencies invisibly (or the single per-file
# result when the directory holds exactly one entry).
docNgram <-
  function( mydir,  type = 0, N = 2, pos = "") {

    per_file <- lapply(dir(mydir, full.names = TRUE), makeNgram, type, N, pos)

    if (length(per_file) == 0) {
      stop("no doc-matrix returned.")
    }
    if (length(per_file) == 1) {
      return(per_file)
    }

    combined <- do.call("rbind", per_file)
    freq_table <- xtabs(Freq ~ ., data = combined)
    invisible(freq_table)
  }
# Remove symbol morphemes ("記号") from morphological-analysis results.
#   x: a list of named character vectors (names are POS tags, as produced
#      by RMeCabC/RMeCabDF), possibly with NA placeholder elements.
# Returns the list with every element named "記号" dropped; elements that
# contain NA are replaced by a single NA, matching the original behavior.
rmSign <- function (x){
  if (!is.list(x)) {
    stop("x must be a list")
  }
  # seq_along() is safe for empty lists, unlike the original 1:length(x).
  for (i in seq_along(x)) {
    if (any(is.na(x[[i]]))) {
      x[[i]] <- NA
    } else {
      # Vectorized replacement for the original index-accumulating loop.
      x[[i]] <- x[[i]][names(x[[i]]) != "記号"]
    }
  }
  return(x)
}
#
# 2008 May 5 09:56
#
## ##  @Book{kitaB02,
##   author =	 {北 研二 and 津田 和彦 and 獅子堀 正幹},
##   yomi   ={Kenji Kita},
##   title = 	 {情報検索アルゴリズム},
##   publisher = 	 {共立出版},
##   year = 	 2002
## }
####################################
# Normalized entropy of each term (row) across documents, in [0, 1].
#   m: term-document matrix; information rows are stripped first.
# Computes -sum_j (p_ij * log p_ij) / log(ndocs) with p_ij = f_ij / F_i.
entropy <-
  function (m) {
    m <- removeInfo(m)
    global_freq <- rowSums(m, na.rm = TRUE)   # global frequency F_i
    p <- m / global_freq                      # per-cell share of F_i
    n_docs <- ncol(m)
    -rowSums((p * log(p)) / log(n_docs), na.rm = TRUE)
  }

####################################################
### local weights ###

# binary weights
# 0/1 indicator matrix: 1 where a term occurs in a document, else 0.
localBin <-
  function(m) {
    cleaned <- removeInfo(m)
    (cleaned > 0) + 0
  }

# termfrequency
# Raw term frequency: the input matrix with the information rows removed.
localTF <-
  function(m) {
    removeInfo(m)
  }

# log'ed termfrequency
# Dampened term frequency: log(tf + 1), information rows removed first.
localLogTF <- function(m) {
  cleaned <- removeInfo(m)
  log(cleaned + 1)
}

#
#######################################################
### glocal weights ###
#
# inverse document frequency

# Inverse document frequency: log2(N / n_i) + 1, where n_i is the number
# of documents containing term i and N the number of documents.
globalIDF <-
  function(m) {
    doc_freq <- rowSums(localBin(m), na.rm = TRUE)  # document frequency n_i
    n_docs <- ncol(removeInfo(m))                   # N
    log2(n_docs / doc_freq) + 1
  }


# global frequency * IDF
# GF-IDF weight: global frequency F_i divided by document frequency n_i.
globalIDF2 <-
  function(m) {
    global_freq <- rowSums(removeInfo(m), na.rm = TRUE)  # F_i
    doc_freq <- rowSums(localBin(m), na.rm = TRUE)       # n_i
    global_freq / doc_freq
  }

# probabilistic IDF: log2((N - n_i) / n_i), where n_i is the document
# frequency of term i and N the number of documents.
globalIDF3 <-
  function(m) {
    df <- rowSums(localBin(m), na.rm = TRUE)  # document frequency n_i
    # BUG FIX: the original wrote ncol(removeInfo(m) - df), i.e. the
    # misplaced parenthesis subtracted df from the matrix before taking
    # ncol(), reducing the formula to log2(N / n_i).
    return(log2((ncol(removeInfo(m)) - df) / df))
  }

# Global weighting: 1 - normalized entropy.  (The previous comment said
# "1 + entropy", but entropy() already returns the POSITIVE normalized
# entropy in [0, 1], so subtracting it here is what down-weights terms
# that are spread evenly across documents.)
globalEntropy <-
  function(m) {
    # entropy() strips the information rows itself, so m is passed as-is.
    return ( (1 - entropy( m )) )
  }


## # normalisation
# Reciprocal Euclidean norm of each document column (info rows removed);
# multiplying by this normalizes each column to unit length.
globalNorm <-
  function(m) {
    cleaned <- removeInfo(m)
    col_norms <- sqrt(colSums(cleaned * cleaned, na.rm = TRUE))
    return(1 / col_norms)
  }

# Reciprocal Euclidean norm of each column of m (no info-row stripping).
mynorm <-
  function(m){
    col_norms <- sqrt(colSums(m * m, na.rm = TRUE))
    1 / col_norms
  }

# Drop the "TOTAL-TOKENS" and "LESS-THAN-*" information rows from a
# term-document matrix; returns the matrix unchanged when neither is present.
removeInfo <-
  function(m){
    if (!is.matrix(m)) {
      stop("first argument must be matrix!")
    }
    grep1 <- grep("TOTAL-TOKENS", rownames(m))
    grep2 <- grep("LESS-THAN-", rownames(m))
    # && instead of the original scalar use of &: both are length-1 here.
    if (length(grep1) == 0 && length(grep2) == 0) {
      return(m)
    }
    # BUG FIX: drop = FALSE keeps the result a matrix even when exactly
    # one row survives; the original collapsed to a bare vector, which
    # breaks downstream rowSums()/colSums() callers.
    m[-c(grep1, grep2), , drop = FALSE]
  }
#################
## /esearch/PublicPapers/TermDocWeighting
