forked from brendano/dlanalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 73fe52a
Showing
4 changed files
with
279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/usr/bin/env ruby | ||
|
||
require 'rubygems' | ||
ENV['XMAS_ENV'] = "production" | ||
require 'xmas' | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
util$mypairs <- function() { | ||
# crazy merge of corrgram::corrgram and psych::pairs.panels | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# generic analysis functions | ||
|
||
source("~/dlanalysis/util.R") | ||
|
||
dlanalysis = new.env() | ||
|
||
agg_to_unit <- function(a, by='unit_id', | ||
up_for_vote = names(a)[bgrep("^X\\.",names(a)) & !bgrep("^X\\.amt",names(a))] | ||
) { | ||
|
||
u = dfagg(a, a[,by], function(x) { | ||
ret = list() | ||
for(attr in up_for_vote) { | ||
# winner = names(which.max(table( x[,attr] ))) | ||
# ret[[paste(attr,'decision',sep='_')]] = winner | ||
p = table(x[,attr]) / nrow(x) | ||
ret[[paste(attr,'entropy',sep='_')]] = sum(-p * log(p), na.rm=T) | ||
ret = c(ret, table(x[,attr])) | ||
} | ||
ret | ||
}) | ||
u | ||
} | ||
|
||
binary_vote <- function(u, thresh=0.5, target='same', total_from=c('same','diff'), bool=FALSE) { | ||
bool_vote = u[,target] / apply(u[,total_from], 1, sum) > thresh | ||
if (bool) return (bool_vote) | ||
fill_bool(bool_vote, target, setdiff(total_from, target)[1]) | ||
} | ||
|
||
|
||
# w = data.frame(row.names=names(sort(-table(a$X.amt_worker_ids)))) | ||
# w$num = table(a$X.amt_worker_ids)[row.names(w)] | ||
# list(w=w, u=u) | ||
|
||
|
||
while("dlanalysis" %in% search()) | ||
detach("dlanalysis") | ||
attach(dlanalysis) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
options(showWarnCalls=T, showErrorCalls=T) | ||
if (system("stty -a &>/dev/null") == 0) | ||
options(width= as.integer(sub(".* ([0-9]+) column.*", "\\1", system("stty -a", intern=T)[1])) ) | ||
|
||
|
||
|
||
util = new.env() | ||
|
||
util$unitnorm <- function(x, ...) (x - mean(x,...)) / sd(x,...) | ||
|
||
util$msg <- function(...) cat(..., "\n", file=stderr()) | ||
|
||
util$strlen <- function(s) length(strsplit(s,"")[[1]]) | ||
|
||
util$strmatch <- function(pat,s) length(grep(pat,s)) > 0 | ||
|
||
util$strstrip <- function(s) gsub("^\\s*|\\s*$", "", s) | ||
|
||
util$unwhich <- function(indices, len=length(indices)) { | ||
ret = rep(F,len) | ||
ret[indices] = T | ||
ret | ||
} | ||
|
||
# util$merge_vec <- function(df, y, by, name) { | ||
# right = data.frame(bla=y) | ||
# right[[name]] = right$bla | ||
# rm(right$bla) | ||
# right[[by]] = as.numeric(names(y)) | ||
# merge(df, right, sort=FALSE) | ||
# } | ||
|
||
util$lax_rbind <- function(...) { | ||
inputs = list(...) | ||
each_names = sapply(inputs, names) | ||
all_names = unique(c(each_names, recursive=TRUE)) | ||
for (k in 1:length(inputs)) { | ||
if (is.null(inputs[[k]])) next | ||
more = setdiff(all_names, names(inputs[[k]])) | ||
inputs[[k]][,more] = NA | ||
} | ||
do.call(rbind, inputs) | ||
} | ||
|
||
util$fill_bool <- function(bool, true='yes', false='no') { | ||
ret = rep(NA,length(bool)) | ||
names(ret) = names(bool) | ||
ret[bool] = true | ||
ret[!bool] = false | ||
ret | ||
} | ||
|
||
# "boolean" grep: return a logical vector ready for &, | etc ops. | ||
# so bgrep works in the world of vector ops like ==, %in%, etc. | ||
|
||
util$bgrep <- function(pat,x, ...) { | ||
unwhich(grep(pat,x,...), length(x)) | ||
} | ||
|
||
util$tapply2 <- function(x, ...) { | ||
if (is.factor(x)) { | ||
r = factor(tapply(as.character(x), ...), levels=levels(x)) | ||
} else { | ||
r = tapply(x, ...) | ||
} | ||
r | ||
} | ||
|
||
util$inject <- function(collection, start, fn) { | ||
acc = start | ||
for (x in collection) | ||
acc = fn(acc, x) | ||
acc | ||
} | ||
|
||
util$select <- function(collection, fn) { | ||
r = c() | ||
for (x in collection) | ||
if (fn(x)) | ||
r = c(r, x) | ||
r | ||
} | ||
|
||
util$is_empty <- function(collection) length(collection) == 0 | ||
|
||
util$as.c <- as.character | ||
|
||
util$xprod <- function(xs,ys) { | ||
ret = list() | ||
i=0 | ||
for (x in xs) for (y in ys) { | ||
i = i+1 | ||
ret[[i]] = list(x=x,y=y) | ||
} | ||
ret | ||
} | ||
|
||
util$timeit <- function(x) { | ||
start = Sys.time() | ||
ret = eval(x) | ||
finish = Sys.time() | ||
print(finish - start) | ||
invisible(ret) | ||
} | ||
|
||
util$dotprogress <- function(callback, interval=100) { | ||
count = 0 | ||
return(function(...) { | ||
if ((count <<- count+1) %% interval == 0) | ||
cat(".") | ||
callback(...) | ||
}) | ||
} | ||
|
||
# dataframe-outputting apply and aggregation functions | ||
|
||
# like sapply/lapply except it expects fn() to yield lists. | ||
# each list gets coerced into a single row of a dataframe. | ||
|
||
util$dfapply <- function(collection, fn, t=F) { | ||
r = sapply(collection, fn) | ||
if (t) r = base::t(r) | ||
r = matrix2df(r) | ||
r | ||
} | ||
|
||
# sapply() with fn() yielding lists retrns a matrix with named rows/cols ... | ||
# and whenever you name-index into this thing it return a list ... yuck | ||
# make that shit more normal. | ||
|
||
util$matrix2df <- function(x) { | ||
if (class(x) != 'matrix') stop("why is class ",class(x)) | ||
colnames = names(x[1,]) | ||
data.frame( | ||
sapply(colnames, function(n) unlist(x[,n])), | ||
row.names=row.names(x)) | ||
} | ||
|
||
|
||
# like by() but the data types are less crazy: | ||
# if fn() returns a list, a data frame is returned. | ||
# -> byvals are the row names. | ||
# -> each list is coerced into the rows. | ||
# if fn() returns a nonlist, a list is returned. | ||
# -> byvals are the names. | ||
# We attempt to be tolerant for slight inconsistencies in fn()'s return values. | ||
|
||
util$dfagg <- function(d, byvals, fn) { | ||
if (class(byvals) == 'function') | ||
byvals = byvals(d) | ||
|
||
b = by(d, byvals, fn) | ||
|
||
cols = NULL | ||
for (i in 1:min(100,length(b))) { | ||
cols = c(cols, names(b[[i]])) | ||
} | ||
cols = unique(cols) | ||
|
||
ret = data.frame(row.names=names(b)) | ||
|
||
for (col in cols) { | ||
ret[,col] = sapply(names(b), function(k) b[[k]][[col]]) | ||
} | ||
if(length(cols) == 0) { | ||
return(sapply(names(b), function(k) b[[k]])) | ||
} | ||
ret | ||
} | ||
|
||
util$mymerge <- function(x,y, row.x=F,row.y=F, by=NULL, ...) { | ||
if (row.x) { | ||
x[,by] = row.names(x) | ||
} | ||
if (row.y) { | ||
y[,by] = row.names(y) | ||
} | ||
|
||
merge(x,y,by=by, ...) | ||
|
||
} | ||
|
||
util$read.xmlss <- function(f) { | ||
## BUG: the xml skips cells sometimes. tricky to parse, argh | ||
# Mac Excel 2004 calls this "XML Spreadsheet". It's nice because it's UTF-8. | ||
# [ mac .xls seems to be macroman, but xls2csv (perl converter) f's it up,. | ||
# and then iconv can't recover. boo! ] | ||
csv_pipe = pipe(paste('ruby <<EOF | ||
require "rubygems" | ||
require "hpricot" | ||
require "fastercsv" | ||
h = Hpricot(File.read("',f,'")) | ||
mat = (h.at("worksheet")/"row").map{|row| (row/"cell").map{|data| data.inner_text}} | ||
mat.each{|row| puts row.to_csv} | ||
', sep='')) | ||
df = read.csv(csv_pipe) | ||
# close(csv_pipe) | ||
df | ||
} | ||
|
||
util$rbern <- function(n, p=0.5) rbinom(n, size=1, prob=p) | ||
|
||
######## | ||
|
||
# for interactivity... | ||
|
||
util$excel <- function(d) { | ||
con = file("/tmp/tmp.csv", "w", encoding="MACROMAN") | ||
write.csv(d, con) | ||
system("open -a 'Microsoft Excel' /tmp/tmp.csv") | ||
close(con) | ||
} | ||
|
||
util$mate <- function(...) { | ||
system(paste("mate", ...)) | ||
} | ||
|
||
# pretty-print as yaml. intended for rows with big textual cells. | ||
|
||
util$ppy <- function(x, column.major=FALSE) { | ||
library(yaml) | ||
cat(as.yaml(x, column.major=column.major)) | ||
cat("\n") | ||
} | ||
|
||
|
||
while("util" %in% search()) | ||
detach("util") | ||
attach(util) |