.packageName <- "RCaBoCha"
# 2008 08 28 #
RCaBoCha <-
function( str1 ){
#   gc()
   if( any( nchar( str1 ) < 1 )){
     stop("first argument must be specified")
   }
  .Call("RCaBoCha", as.character(str1) ,  PACKAGE="RCaBoCha")
}
# 2008 08 28
RCaBoChaCnt<-
function( str1 ){
#   gc()
   if( any( nchar( str1 ) < 1 ) ){
        stop("file argument must be specified")
   }
   else if(!file.exists(str1)){
     stop("file argument must be specified")
   }

  
  .Call("RCaBoChaCnt", as.character(str1) ,  PACKAGE="RCaBoCha")
}

RCaBoChaDF <-
  function( charVec = c("南瓜","基広"),  rmT = c("記号"), str2 = "ない", minFreq = 1, weight = "no" ) {
    if(  length(charVec) < 1 ){
       stop("character vector must be specified.")
    }else{
      charLeng <- length(charVec)
    }
    if( any(nchar(rmT) < 1) ||  length(rmT) < 1 ) {
      stop("romved Terms must be specified.")
    }else{
      rmTN <- length(rmT)
    }
    if(  (!is.numeric(minFreq) ) || minFreq < 1){
      stop("minFreq > 0 must be specified.")     
    }
    if( any( nchar(str2) < 1) ){
      stop("str2 must be specified.")
    }

     
    dtm <- .Call("RCaBoChaDF", as.character(charVec), as.numeric(charLeng), rmT,  as.numeric(rmTN),  as.character(str2), as.numeric(minFreq), PACKAGE="RCaBoCha")

    if(is.null(dtm)){
      stop("give less number to minFreq!")
    }

    dtm <- t(dtm)
#  environment(dtm) = new.env()
## ##   class(dtm) <- "RCaBoChaMatrix"

##   ######### < 2008 05 04 uncommented>
    if(weight == ""){
      invisible( dtm)
      break
    }
    argW <- unlist(strsplit(weight, "*", fixed = T))

    for(i in 1:length(argW)){
      if(argW[i] == "no"){
        invisible( dtm)  
 ##        cat("Term Document Matrix includes 2 information rows!", "\n")
##         cat("whose names are [[LESS-THAN-", minFreq,"]] and [[TOTAL-TOKENS]]", "\n", sep = "")
##         cat("if you remove these rows, execute", "\n", "result[ row.names(result) !=  \"[[LESS-THAN-", minFreq, "]]\" , ]", "\n", "result[ row.names(result) !=  \"[[TOTAL-TOKENS]]\" , ]","\n" , sep = "")
        break
      }else if(argW[i] == "tf"){
        dtm <- localTF(dtm)
      }else if(argW[i] == "tf2"){
        dtm <- localLogTF(dtm)
      }else if(argW[i] == "tf3"){
        dtm <- localBin(dtm)
      }else if(argW[i] == "idf"){
        dtm <- dtm * globalIDF(dtm)
      }else if(argW[i] == "idf2"){
        dtm <- dtm * globalIDF2(dtm)
      }else if(argW[i] == "idf3"){
        dtm <- dtm * globalIDF2(dtm)
      }else if(argW[i] == "idf4"){
        dtm <- dtm * globalEntropy(dtm)
      } else if(argW[i] == "norm"){
        
        dtm <- t(t(dtm) * mynorm(dtm))
      }
    }
    if(any(is.nan( dtm))){
      cat("Warning! Term document matrix includes NaN!", "\n")
    }
    invisible( dtm)  
  }
# 2008 08 28
RCaBoChaFile<-
function( str1, rmT = c("記号"), str2 = "ない" ){
#   gc()
  if( any( nchar( str1 ) < 1 ) ){
    stop("first argument must be specified")
   }
  if(!file.exists(str1)){
    stop("first argument must be specified")
  }
  if( any( nchar( str2 ) < 1 ) ){
    stop("second argument must be specified")
  }
  if( any(nchar(rmT) < 1) || length(rmT) < 1){
    stop("rmT argument must be specified")
  }else{
    rmTN <- length(rmT)
  }
  
  .Call("RCaBoChaFile", as.character(str1)  , as.character(rmT) ,  as.numeric(rmTN) ,  as.character(str2),  PACKAGE="RCaBoCha")
}
# 2008 08 28 # 
RCaBoChaFreq <-
function( str1 = "南瓜", str2 = "ない" ){
#   gc()
   if( any(nchar( str1 ) < 1) ){
     stop("first argument must be specified")
   }
   if( any( nchar( str2 ) < 1 )){
     stop("second argument must be specified")
   }
  
  .Call("RCaBoChaFreq", as.character(str1) , as.character(str2) ,  PACKAGE="RCaBoCha")
}
#RCaBoChaMx(directory, pos= c("記号"), str2 = "ない",  minFreq = 1, weight = "no")
RCaBoChaMx <-
  function( directory,  rmT= c("記号"), str2 = "ない",   minFreq = 1, weight = "no" ) {
    rmTN <- length(rmT)
#    gc()
    if(any(suppressWarnings(dir(directory) ) > 0)){
      ft <- 1 ##ディレクトリが指定された
	file <- dir(directory)
    } else if (file.exists(directory)){
      ft <- 0 # 単独ファイル
      file <- basename(directory)
      directory <- dirname(directory)

    } else{
      stop("specify directory or a file!")
    }
    fileN = length(file)

    if( any(nchar(rmT) < 1)  || rmTN  < 1){
      stop("specify rm argument")
    } else if("記号" %in% rmT){
      sym = 1 # 記号を頻度に含めて出力する
    }
    if( (!is.numeric(minFreq) ) || minFreq < 1){
      stop("minFreq argument must be equal to or larger than 1!")      
    }

    
    dtm <- .Call("RCaBoChaMx", as.character(directory), as.character(file), as.numeric(fileN), as.numeric(ft), as.character(rmT), as.numeric(rmTN), as.character(str2),  as.numeric(minFreq), PACKAGE="RCaBoCha")

    if(is.null(dtm)){
      stop("chage the value of minFreq argument!")
    }

    dtm <- t(dtm)
#  environment(dtm) = new.env()
## ##   class(dtm) <- "RCaBoChaMatrix"

##   ######### < 2008 05 04 uncommented>
    if(weight == ""){
      invisible( dtm)
      break
    }
    argW <- unlist(strsplit(weight, "*", fixed = T))

    for(i in 1:length(argW)){
      if(argW[i] == "no"){
        invisible( dtm)  
 ##        cat("Term Document Matrix includes 2 information rows!", "\n")
##         cat("whose names are [[LESS-THAN-", minFreq,"]] and [[TOTAL-TOKENS]]", "\n", sep = "")
##         cat("if you remove these rows, execute", "\n", "result[ row.names(result) !=  \"[[LESS-THAN-", minFreq, "]]\" , ]", "\n", "result[ row.names(result) !=  \"[[TOTAL-TOKENS]]\" , ]","\n" , sep = "")
        break
      }else if(argW[i] == "tf"){
        dtm <- localTF(dtm)
      }else if(argW[i] == "tf2"){
        dtm <- localLogTF(dtm)
      }else if(argW[i] == "tf3"){
        dtm <- localBin(dtm)
      }else if(argW[i] == "idf"){
        dtm <- dtm * globalIDF(dtm)
      }else if(argW[i] == "idf2"){
        dtm <- dtm * globalIDF2(dtm)
      }else if(argW[i] == "idf3"){
        dtm <- dtm * globalIDF2(dtm)
      }else if(argW[i] == "idf4"){
        dtm <- dtm * globalEntropy(dtm)
      } else if(argW[i] == "norm"){
        
        dtm <- t(t(dtm) * mynorm(dtm))
      }
    }
    if(any(is.nan( dtm))){
      cat("Warning! Term document matrix includes NaN!", "\n")
    }
    invisible( dtm)  
  }
#
# 2008 May 5 09:56
#
## ##  @Book{kitaB02,
##   author =	 {北 研二 and 津田 和彦 and 獅子堀 正幹},
##   yomi   ={Kenji Kita},
##   title = 	 {情報検索アルゴリズム},
##   publisher = 	 {共立出版},
##   year = 	 2002
## }
####################################
entropy <-
  function (m) {
    m <- removeInfo ( m ) 
    gf = rowSums(m, na.rm = TRUE)  # 大域的頻度 F_i
    p = m / gf                     # 各出現頻度 / 大域頻度
    ndocs = ncol(m)
    entropy = - rowSums( (p*log(p)) / log(ndocs), na.rm = TRUE )
    return ( entropy )
  }

####################################################
### local weights ###

# binary weights
localBin <-
  function(m) {
    m <- removeInfo ( m ) 
    return( (m>0)*1 )
  }

# termfrequency
localTF <-
  function(m) {
    m <- removeInfo ( m ) 
    return( m )
  }

# log'ed termfrequency
localLogTF <- function(m) {
  m <- removeInfo ( m ) 
  return( log(m+1) )
}

#
#######################################################
### glocal weights ###
#
# inverse document frequency

globalIDF <-
  function(m) {

#    m <- removeInfo ( m ) 
#    df = rowSums(localBin(m), na.rm=TRUE)
#    return ( log2(ncol(m)/df)  )
    df <- rowSums(localBin( m ), na.rm=TRUE)# 各単語を含む文書数 (文書頻度 n_i)
    return ( ( log2(ncol( removeInfo(m) )/df) + 1 ) )
  }


# global frequency * IDF
globalIDF2 <-
  function(m) {
#     m <- removeInfo ( m )
        
    gf <- rowSums( removeInfo( m ) , na.rm = TRUE)# 大域的頻度 F_i
    df <- rowSums(localBin( m ), na.rm=TRUE)      # 文書頻度  n_i
    return ( gf/df )
  }

# probabilistic IDF
globalIDF3 <-
  function(m) {
#     m <- removeInfo ( m )
    df <- rowSums(localBin( m ), na.rm=TRUE)# 各単語を含む文書数 (文書頻度)
    return ( log2 ( ncol( removeInfo(m) - df) /df  ) )
  }

# global weighting = 1 + entropy
globalEntropy <-
  function(m) {
#    m <- removeInfo ( m )
    
    return ( (1 - entropy( m )) )
  }


## # normalisation
globalNorm <-
  function(m) {

    m <- removeInfo ( m ) 
    return ( 1 / sqrt( colSums(( m*m ), na.rm = TRUE) ) )# ノルムで割る
  }

mynorm <-
  function(m){
    return ( 1 / sqrt( colSums(( m*m ), na.rm = TRUE) ) )# ノルムで割る
  }

removeInfo <-
  function(m){
    if(!is.matrix( m) ){
      stop("first argument must be matrix!")
    }
    grep1 <- grep("TOTAL-TOKENS",  rownames( m ))
    grep2 <- grep("LESS-THAN-",    rownames( m ))
    if(length(grep1) == 0 & length(grep2) == 0) {
      return (m)
    } else{ 
      return( m [-c(grep1, grep2), ]  )
    }
    return(m)

  }
#################
## /esearch/PublicPapers/TermDocWeighting
