R_回帰分析とbreakpoint の変更点 - アールメカブ

アールメカブ


R_回帰分析とbreakpoint の変更点


[[R_Baayen]]

#ref(baayen240.png,nowrap,center,nolink,時代の変曲点)


 # "1994 年の新聞記事"の中で参照された時代を記録したデータ
 head(faz, 3) 
 faz$Distance <- 1:nrow(faz)
 
 plot(log(faz$Distance), log(faz$Frequency + 1), 
   xlab = "log Distance", ylab = "log Frequency")

 # p.237
 faz$LogFrequency <- log(faz$Frequency + 1)
 faz$LogDistance <- log(faz$Distance)
 breakpoint <- log(59)

 faz$ShiftedLogDistance <- faz$LogDistance - breakpoint
 plot(faz$ShiftedLogDistance, faz$LogFrequency, 
   xlab = "log Shifted Distance", 
   ylab = "log Frequnecy")
 
 faz.left <- lm(LogFrequency ~ ShiftedLogDistance,
   data = faz[faz$ShiftedLogDistance <= 0,])
 faz.right <- lm(LogFrequency ~ ShiftedLogDistance,
   data = faz[faz$ShiftedLogDistance >= 0,])
 
 abline(faz.left, lty = 1)
 abline(faz.right, lty = 2)
 

ここでLogFrequency ~ ShiftedLogDistance : PastBreakPoint は
最初の引数で求めた変数のスロープをもとに,データの後半部分()のスロープを求めることを意味する.(Baayen, p.354:6:5)

 faz$PastBreakPoint <- as.factor(
   faz$ShiftedLogDistance > 0)
 faz.both <- lm(LogFrequency ~ ShiftedLogDistance :
   PastBreakPoint, data = faz)
 anova(faz.both, lm(LogFrequency ~ ShiftedLogDistance,
   data = faz))
 
 # p.237
 deviances <- rep(0, nrow(faz) - 1)
 
 for(pos in 1:(nrow(faz) - 1)){
  breakpoint <- log(pos)
  faz$ShiftedLogDistance <- 
    faz$LogDistance - breakpoint
  faz$PastBreakPoint <- as.character(
    faz$ShiftedLogDistance > 0)
  faz.both <- lm(LogFrequency ~ 
    ShiftedLogDistance:PastBreakPoint, data = faz)
  deviances[pos] <- deviance(faz.both)
 }
 # p.239
 
 best <- which(deviances == min(deviances))
 best
 breakpoint <- log(best)
 faz$ShiftedLogDistance <- faz$LogDistance - breakpoint
 faz$PastBreakPoint <- as.character(
   faz$ShiftedLogDistance > 0)
 faz.both <- lm(LogFrequency ~  
   ShiftedLogDistance:PastBreakPoint, data = faz)
 
 par(mfrow = c(2,2), bg = "white")
 
 plot(log(1:length(deviances)), deviances, type = "l",
   xlab = "breakpoint",ylab = "deviance")
 plot( faz$LogDistance, faz$LogFrequency, type = "l",
  xlab = "Log Distance",ylab = "Log Frequency",
  col = "darkgrey")
 lines(faz$LogDistance, fitted(faz.both))