# Extract the extension of each file path.
#
# The extension is whatever follows the last dot of the file's base name
# (directory components are ignored). Paths whose base name contains no dot,
# as well as NA entries, yield the empty string. A leading-dot name such as
# ".Rprofile" is reported as having the extension "Rprofile".
#
# files: character vector of file paths.
# Returns a character vector of the same length as `files`.
getExtension <- function(files) {
  out <- character(length(files))
  # Both the dot test and the extraction work on the base name, so dots in
  # directory names or a trailing path separator cannot leak into the result.
  base <- basename(files)
  has_ext <- grepl(".", base, fixed = TRUE)
  # Greedy match up to and including the last dot; what remains is the extension.
  out[has_ext] <- sub("^.*\\.", "", base[has_ext], perl = TRUE)
  out
}
# Turn one commit chunk of a verbose log into a character matrix.
#
# txt: character vector holding one chunk. Element 1 is the separator line,
#      element 2 the header ("r<rev> | <author> | <date ...> | ..."),
#      element 3 is skipped, and the changed-path entries run from element 4
#      up to the line before the first empty line.
#
# Returns a character matrix with one row per changed path and five columns
# (revision, author, date as the first 10 characters of the date field,
# action, file), or NULL when the chunk holds no header, was committed by
# "apache" or "root", or lists no changed paths.
process_chunk <- function(txt) {
  if (length(txt) < 2L) {
    return(NULL)
  }

  header_line <- strsplit(txt[2L], " | ", fixed = TRUE)[[1L]][seq_len(3L)]
  revision <- substring(header_line[1L], 2L)  # drop the leading "r"
  author <- header_line[2L]
  if (author %in% c("apache", "root")) {
    return(NULL)
  }
  date <- substring(header_line[3L], 1L, 10L)

  # The path list ends just before the first blank line; without any blank
  # line it is taken to run to the end of the chunk.
  first_blank <- which(txt == "")[1L]
  last_path <- if (is.na(first_blank)) length(txt) else first_blank - 1L
  # Guard against a descending sequence: a commit without a path list has its
  # blank line at position 3, and seq.int(4L, 2L) would count *down* and pick
  # up message lines as if they were paths.
  if (last_path < 4L) {
    return(NULL)
  }

  entries <- sub("^ +", "", txt[seq.int(4L, last_path)])

  # The action is the first space-delimited token; the path is everything
  # after it, so that paths containing spaces are kept whole.
  actions <- sub(" .*$", "", entries)
  has_path <- grepl(" ", entries, fixed = TRUE)
  files <- rep(NA_character_, length(entries))
  files[has_path] <- sub("^[^ ]* +", "", entries[has_path])
  # Copied paths carry a trailing " (from <path>:<rev>)" annotation.
  files <- sub(" \\(from .*:[0-9]+\\)$", "", files)

  nlines <- length(entries)
  matrix(
    c(
      rep.int(revision, nlines),
      rep.int(author, nlines),
      rep.int(date, nlines),
      actions,
      files
    ),
    nrow = nlines,
    ncol = 5L
  )
}
# Read "rsvn.log", split it into per-commit chunks, and stack the matrices
# returned by process_chunk() into one character matrix (one row per changed
# path).
data <- local({
  lines <- readLines("rsvn.log")
  # A line of dashes only counts as a commit separator when it is followed by
  # a revision header ("r<digits> | ...") or is the last line of the file.
  # This keeps dash-only lines inside commit messages from splitting a commit.
  is_dashes <- grepl("^-+$", lines)
  starts_commit <- c(grepl("^r[0-9]+ \\| ", lines[-1L]), TRUE)
  index <- cumsum(is_dashes & starts_commit)
  commits <- split(lines, index)
  do.call(rbind, lapply(commits, process_chunk))
})
if (is.null(data)) {
  stop("no commits could be parsed from 'rsvn.log'", call. = FALSE)
}
colnames(data) <- c("revision", "author", "date", "action", "file")

# Convert to a data frame, type the columns, and add derived columns
# (extension, weekday, year, month). Three author names are mapped onto a
# different login so that each is counted under a single name.
data <- within(as.data.frame(data, stringsAsFactors = FALSE), {
  date <- as.Date(date, format = "%Y-%m-%d")
  revision <- as.integer(revision)
  extension <- getExtension(file)
  weekday <- weekdays(date)
  year <- as.integer(format(date, "%Y"))
  month <- month.abb[as.integer(format(date, "%m"))]
  author[author == "thomas"] <- "tlumley"
  author[author == "paul"] <- "murrell"
  author[author == "martyn"] <- "plummer"
})

# Keep only rows dated after 1997-01-01 (rows with an unparsable date are
# dropped as well, since subset() discards NA conditions).
data <- subset(data, date > as.Date("1997-01-01"))
# One row per revision: the first row seen for each revision number,
# restricted to the commit-level columns.
simple <- data[
  match(unique(data$revision), data$revision),
  c("revision", "author", "date", "month", "year", "weekday")
]
# Number of commits per calendar day, with year, weekday and numeric month
# derived from the date.
daydata <- local({
  per_day <- table(simple$date)
  days <- as.Date(names(per_day))
  out <- data.frame(date = days, commits = as.integer(per_day))
  out$year <- as.integer(format(days, "%Y"))
  out$weekday <- weekdays(days)
  out$month <- as.integer(format(days, "%m"))
  out
})
# Number of commits per (day, author) pair, with numeric month and year
# derived from the date.
day_author_data <- local({
  counts <- aggregate(
    simple$revision,
    by = list(date = simple$date, author = simple$author),
    FUN = length
  )
  names(counts)[3] <- "commits"
  counts$date <- as.Date(counts$date)
  counts$month <- as.integer(format(counts$date, "%m"))
  counts$year <- as.integer(format(counts$date, "%Y"))
  counts
})
# Number of commits per calendar month. `date` is the first day of the month;
# `year` and `month` are the character pieces of the "YYYY-MM" key.
monthdata <- local({
  month_key <- sprintf("%04d-%02d", daydata$year, daydata$month)
  totals <- aggregate(daydata$commits, list(date = month_key), sum)
  names(totals)[2] <- "commits"

  pattern <- "^(\\d+)-(\\d+)$"
  key <- totals$date
  totals$month <- sub(pattern, "\\2", key)
  totals$year <- sub(pattern, "\\1", key)
  totals$date <- as.Date(paste(key, "01", sep = "-"), format = "%Y-%m-%d")
  totals
})
# Number of commits per (calendar month, author) pair. `date` is the first
# day of the month; `year` and `month` are the character pieces of the
# "YYYY-MM" key.
month_author_data <- local({
  month_key <- sprintf(
    "%04d-%02d", day_author_data$year, day_author_data$month
  )
  totals <- aggregate(
    day_author_data$commits,
    list(date = month_key, author = day_author_data$author),
    sum
  )
  names(totals)[3] <- "commits"

  pattern <- "^(\\d+)-(\\d+)$"
  key <- totals$date
  totals$month <- sub(pattern, "\\2", key)
  totals$year <- sub(pattern, "\\1", key)
  totals$date <- as.Date(paste(key, "01", sep = "-"), format = "%Y-%m-%d")
  totals
})
# Running file count per revision: for every revision, the number of "A"
# actions minus the number of "D" actions, accumulated in revision order,
# together with the date and author of the revision's first row.
nfiles_data <- local({
  rev_id <- data$revision
  first_of <- function(v) head(v, 1)
  net_added <- tapply(data$action, rev_id, function(act) {
    sum(act == "A", na.rm = TRUE) - sum(act == "D", na.rm = TRUE)
  })
  data.frame(
    revision = sort(unique(rev_id)),
    nfiles = cumsum(net_added),
    date = as.Date(tapply(as.character(data$date), rev_id, first_of)),
    author = tapply(data$author, rev_id, first_of)
  )
})