# rsvnlog.R

#!/bin/env Rscript

### to fetch the data directly from the svn repository
# system( "svn log -v https://svn.r-project.org/R > rsvn.log" )
# 
### or download it from my website
# download.file( 
#   "http://addictedtor.free.fr/misc/R50000/rsvn.log", 
#   dest = "rsvn.log" )

#' Get the extension of each file, or "" when the file has none.
#'
#' Only the file name (last path component) is inspected, so a dot in a
#' directory name is never mistaken for an extension.
#'
#' @param files character vector of file paths.
#' @return character vector of the same length as `files`: the text after
#'   the last dot of the file name, or "" when the file name has no dot.
getExtension <- function( files ){
    out <- character( length(files) )
    file_names <- basename( files )
    has.ext <- grepl( ".", file_names, fixed = TRUE )
    # greedy match: drop everything up to and including the last dot
    out[ has.ext ] <- sub( "^.*\\.", "", file_names[has.ext] )
    out
}

#' Process one svn log item, something like:
# ------------------------------------------------------------------------
# r50000 | ripley | 2009-10-09 10:34:17 +0200 (Fri, 09 Oct 2009) | 1 line
# Changed paths:
#    M /branches/R-2-10-branch/src/library/stats/R/plot.lm.R
#
# port r49999 from trunk
#'
#' @param txt character vector holding the lines of one item: separator,
#'   header, "Changed paths:", one line per path, a blank line, the message.
#' @return NULL when `txt` is not a usable item (fewer than two lines, second
#'   line is not a "rNNN | ..." header) or when the author is "apache" or
#'   "root". Otherwise a character matrix without dimnames and with five
#'   columns (revision, author, date, action, file), one row per changed
#'   path. An item that lists no paths gives a single row whose action and
#'   file are NA.
process_chunk <- function( txt ){
    # a lone separator line carries no commit
    if( length( txt ) < 2L ) return( NULL )

    # a chunk whose second line is not a header is not a log item
    # (e.g. a commit message that contains a line of dashes)
    if( !grepl( "^r[0-9]+ \\| ", txt[2L] ) ) return( NULL )

    header_line <- strsplit( txt[2L], " | ", fixed = TRUE )[[1]][ c(1L, 2L, 3L) ]

    # revision number (drop the leading "r")
    revision <- substring( header_line[1], 2 )

    # who committed the revision
    author <- header_line[2]

    # these are uninteresting automatic commits (only date-stamp is involved)
    if( author %in% c("apache", "root") ) return( NULL )

    # keep the YYYY-MM-DD part of the time stamp
    date <- substring( header_line[3], 1, 10 )

    # path lines run from line 4 up to the line before the first blank one;
    # without a blank line (truncated log) they run to the end of the chunk
    blank <- which( txt == "" )[1L]
    last_path <- if( is.na( blank ) ) length( txt ) else blank - 1L

    if( last_path >= 4L ){
        path_lines <- sub( "^ +", "", txt[ 4L:last_path ] )

        # M, A, D, ... : the first token of the line
        actions <- sub( " .*$", "", path_lines )
        actions[ !nzchar( actions ) ] <- NA_character_

        # the files: everything after the action, so that paths containing
        # spaces stay whole; the " (from path:rev)" copy suffix is removed
        files <- rep( NA_character_, length( path_lines ) )
        has_path <- grepl( " ", path_lines, fixed = TRUE )
        files[ has_path ] <- sub( "^[^ ]+ +", "", path_lines[ has_path ] )
        files <- sub( " \\(from .*:[0-9]+\\)$", "", files )
    } else {
        # no path listed: counting down from 4 would pick up the message
        # and header lines, so emit one row with missing action and file
        actions <- NA_character_
        files   <- NA_character_
    }

    # we format the result as a matrix
    # this is much more efficient than formatting as a data frame
    # at that stage
    nlines <- length( actions )
    matrix( c(
        rep.int( revision, nlines ),
        rep.int( author, nlines ),
        rep.int( date, nlines ),
        actions,
        files ), nrow = nlines )
}

# read the raw log, cut it into one chunk per log item and stack the
# matrices returned for each chunk
data <- local( {
    log_lines    <- readLines( "rsvn.log" )
    # every line made only of dashes opens a new chunk
    is_separator <- grepl( "^-+$", log_lines )
    chunks       <- split( log_lines, cumsum( is_separator ) )
    do.call( rbind, lapply( chunks, process_chunk ) )
} )
colnames( data ) <- c( "revision", "author", "date", "action", "file" )

# convert to a data frame, give the columns proper types and derive a few
# more columns from the date and the file name
data <- within( as.data.frame( data, stringsAsFactors = FALSE ), {
    date      <- as.Date( date, format = "%Y-%m-%d" )
    revision  <- as.integer( revision )
    extension <- getExtension( file )
    weekday   <- weekdays( date )
    year      <- as.integer( format( date, "%Y" ) )
    month     <- month.abb[ as.integer( format( date, "%m" ) ) ]

    # deal with name switchers (thanks dirk): map the old login to the new one
    author <- replace( author, author == "thomas", "tlumley" )
    author <- replace( author, author == "paul", "murrell" )
    author <- replace( author, author == "martyn", "plummer" )
} )

# keep only the rows dated after 1997-01-01 (rows with a missing date go too)
data <- data[ which( data$date > as.Date("1997-01-01") ), , drop = FALSE ]

# one row per commit: the first row of each revision, restricted to the
# columns that describe the commit rather than a file
simple <- local( {
    commit_columns  <- c( "revision", "author", "date", "month", "year", "weekday" )
    first_of_commit <- !duplicated( data$revision )
    data[ first_of_commit, commit_columns ]
} )

# number of commits per day (only days that have at least one commit)
daydata <- local( {
    per_day <- table( simple$date )
    counts  <- data.frame(
        date    = as.Date( names( per_day ) ),
        commits = as.integer( per_day ) )
    within( counts, {
        month   <- as.integer( format( date, "%m" ) )
        weekday <- weekdays( date )
        year    <- as.integer( format( date, "%Y" ) )
    } )
} )

# number of commits per day and per author
day_author_data <- aggregate(
    simple$revision,
    list( date = simple$date, author = simple$author ),
    length )
# the third column holds the counts
names( day_author_data )[3] <- "commits"
day_author_data[["date"]]  <- as.Date( day_author_data[["date"]] )
day_author_data[["month"]] <- as.integer( format( day_author_data[["date"]], "%m" ) )
day_author_data[["year"]]  <- as.integer( format( day_author_data[["date"]], "%Y" ) )

# number of commits per month, summed from the daily counts
monthdata <- local( {
    year_month <- sprintf( "%04d-%02d", daydata$year, daydata$month )
    ym_pattern <- "^(\\d+)-(\\d+)$"
    totals <- aggregate( daydata$commits, list( date = year_month ), sum )
    names( totals )[2] <- "commits"
    within( totals, {
        # year and month are kept as the character pieces of the "YYYY-MM" key
        year  <- sub( ym_pattern, "\\1", date )
        month <- sub( ym_pattern, "\\2", date )
        # the month is dated on its first day
        date  <- as.Date( paste( date, "01", sep = "-" ), format = "%Y-%m-%d" )
    } )
} )

# number of commits per month and per author, summed from the daily counts
month_author_data <- local( {
    year_month <- sprintf( "%04d-%02d", day_author_data$year, day_author_data$month )
    ym_pattern <- "^(\\d+)-(\\d+)$"
    groups <- list( date = year_month, author = day_author_data$author )
    totals <- aggregate( day_author_data$commits, groups, sum )
    names( totals )[3] <- "commits"
    within( totals, {
        # year and month are kept as the character pieces of the "YYYY-MM" key
        year  <- sub( ym_pattern, "\\1", date )
        month <- sub( ym_pattern, "\\2", date )
        # the month is dated on its first day
        date  <- as.Date( paste( date, "01", sep = "-" ), format = "%Y-%m-%d" )
    } )
} )


# number of files in the distribution at each revision: running total of
# the "A"dded files minus the "D"eleted files of every revision so far
nfiles_data <- local( {
    by_revision <- data$revision
    # net change in the number of files brought by one revision
    net_change <- tapply( data$action, by_revision, function(x){
        sum( x == "A", na.rm = TRUE) - sum(x == "D", na.rm = TRUE)
    } )
    data.frame(
        revision = sort( unique( by_revision ) ),
        nfiles   = cumsum( net_change ),
        # date and author are taken from the first row of each revision
        date     = as.Date( tapply( as.character(data$date), by_revision, head, 1 ) ),
        author   = tapply( data$author, by_revision, head, 1 ) )
} )