################################################################# # # # BoP_plot.R - an R script to plot the log file of BoP # # # # This file extracts, reads in and plots the results of the # # "Biology or Physics?" game, taking data from the log at # # www.srcf.ucam.org/~lj237/BoP/log.txt # # # # To load in the functions, simply type "source('grapher.R')" # # at the R terminal. Info can be got by calling the functions # # manually. All the functions have sensible defaults, so they # # can be called without arguments. logurl contains the log url # # # # functions: # # # # answers <- read.answers() - reads the answers (with # # timestamps removed) into object "answers" # # # # results <- get.answers() - gives the number correct, the # # number answered and error bars for each question# # # # get.statistics() - returns the median and modal scores, and # # plots a histogram of scores # # # # plot.answers() - plots a bargraph of % correct for each # # question # # # # plot.hitcount() - plots the number of hit counts over time, # # starting from when it was first announced on # # the blog "Pharyngula" # # # # # # by Luke, 26/5/08 # # # ################################################################# ########## functions ################## read.answers <- function(file = 'http://www.srcf.ucam.org/~lj237/BoP/log.txt'){ # reads in answers from file, and removes timestamps and comment lines # read in the file raw <- scan(file,"char",sep="\n") # find the timestamps and comments stamps <- grep("IP",raw) comments <- grep("#",raw) # return the non-timestamped/commented files return(raw[-union(stamps,comments)]) } string2answers <- function(string,N.questions){ # takes an answer string, and produces numeric answers # 1 for correct, 0 for incorrect, and NA for not answered # split it up into a vector of answers of the form ("Q:C") # where Q is the question and C is whether it was correctly answered (Note, first entry is blank) substring <- strsplit(string," ")[[1]] # for each question, find the value that follows the ":", or NA if it is blank answers <- sapply(2:(N.questions+1),function(x) as.numeric(substr(substring[x],regexpr(":",substring[x])+1,1000))) #return the answers return(answers) } get.answers <- function(data = read.answers(), N.questions = 12){ # takes data, the output of read.answers, and returns the proportion # correct for each answer # produce a matrix of answers, using the string2answers function answer.grid <- sapply(data,string2answers,N.questions=N.questions) # find the number answered (i.e. the number that isn't NA) for each question answered <- apply(!is.na(answer.grid),1,sum) # find the number answered correctly for each question (also, deviation) correct <- apply(answer.grid,1,sum,na.rm=T) errorbars <- sapply(1:N.questions, function(x) find.errorbars(correct[x],answered[x])) # return the ratio of the two if prop=TRUE return(list(correct=correct,answered=answered,errorbars=errorbars)) } find.errorbars <- function(q,N,pval = 0.05){ # finds the error bars of an answer (q out of N), by finding the # lowest and highest possible true scores that give a greater than pval # chance of producing the observed score # initialise upper and lower bounds lower <- q upper <- q # find lower bound k <- 1 while (k > pval){ k <- (1 - pbinom(q,N,lower/N)) lower <- lower - 1 } # find upper bound k <- 1 while (k > pval){ k <- pbinom(q,N,upper/N) upper <- upper + 1 } # return the lower and upper bounds for which k > 0.05 return(c(lower+1,upper-1)) } plot.answers <- function(file = 'http://www.srcf.ucam.org/~lj237/BoP/log.txt', N.questions = 12){ # reads the answers in, calculates the proportion correct # and plots the proportion correct for each question # read in the data and calculate proportion correct data <- read.answers(file) answers <- get.answers(data,N.questions) prop.correct <- answers$correct/answers$answered errors <- answers$errorbars/answers$answered # plot as a bargraph, and put a line at 50% correct barplot(prop.correct*100,names.arg=1:12,xlab="Question",ylab="% Correct",ylim=c(0,100)) abline(h=50,col="red") # plot error bars points <- seq(from=0.7,by=1.2,length.out = N.questions) for (i in 1:N.questions) lines(c(points[i],points[i]),errors[,i]*100) # put the sample size at the top text(0.7,97,paste("N =",length(data)),col="red") } get.statistics <- function(log='http://www.srcf.ucam.org/~lj237/BoP/log.txt',plot=TRUE){ # plots some basic statistics about the results of BoP so far, and produces a plot # set plot=F to disable plotting # extract the scores out of 12 suppressWarnings(numbers <- as.numeric(substr(read.answers(log),52,1000))) numbers <- numbers[!is.na(numbers)] # print some stats cat('\nResults Stats: \n\n') cat(paste(' Median:',median(numbers),'\n')) cat(paste(' Mode:',names(sort(table(numbers),decreasing=T)[1]),'\n')) cat(paste(' Variance::',var(numbers),'\n')) # produce a histogram of results, if wanted if (plot){ cat('\n Histogram plotted \n\n') title <- paste("Histogram of results. N =",length(numbers)) hist(numbers,breaks=range(numbers)[2] - range(numbers)[1],,col="grey",main=title,xlab="Results (out of 12)") } } plot.hitcounts <- function(log = 'http://www.srcf.ucam.org/~lj237/BoP/log.txt',from=13040000,xlab="Hours Post Pharyngula"){ # plots a graph of hits over time, and returns a "hist" object of that data # the "from" argument determines time 0 in number of seconds post start of 2008 # (hits before that are ignored): set to 0 to get all hits # the x axis is the number of hours since time 0 # create a vector for turning month characters into days since end of january month.names <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec") month.lengths <- c(31,28,31,30,31,30,31,31,30,31,30,31) month.cumlengths <- sapply(1:12,function(x) sum(month.lengths[1:x])) - month.lengths[1] names(month.cumlengths) <- month.names # read in the data data <- scan(log,"char",sep="\n") times <- data[(3.5:((length(data)-1)/2))*2] # get time counts suppressWarnings({ seconds <- as.numeric(substr(times,24,25)) minutes <- as.numeric(substr(times,21,22)) hours <- as.numeric(substr(times,18,19)) day <- as.numeric(substr(times,6,7)) month <- substr(times,9,11)}) # combine time <- (((day + month.cumlengths[month])*24 + hours)*60 + minutes)*60 + seconds # only take times post "from" time <- time - from time <- time[time >= 0] # plot the histogram hist.data <- hist(time/(60*60),main="Hits over time",ylab="Hits per hour",xlab=xlab) # return the histogram data return(hist.data) } # set the log url as a character string logurl <- "http://www.srcf.ucam.org/~lj237/BoP/log.txt"