I am not a mathematician and thus need major help representing my actions mathematically. First I'll explain what I'm doing and then a crack at writing a formula for it.
I have time stamps for two codes that look like this (call them code x and y):
And here's the structure of those two codes:
These will output a vector of distances.
Mathematically here's a whack representing it with numbers:
\(min(|y - x_i|)\)
As I write it out the 2 things I am struggling with is how to:
I can write the code and have it for R if this is useful in understanding what I'm doing though it's not really reproducible as it relies on my package I'm creating:
I have time stamps for two codes that look like this (call them code x and y):
Code:
#code x
duration start end
5 1 184 184
55 1 905 905
92 3 1811 1813
#code y
duration start end
4 2 107 108
6 2 116 117
8 2 131 132
10 1 145 145
12 4 166 169
16 2 212 213
22 5 293 296
58 2 704 705
70 2 877 878
72 2 941 942
109 2 1787 1788
121 1 1982 1982
Code:
x <- structure(list(duration = c(1L, 1L, 3L), start = c(184L, 905L,
1811L), end = c(184L, 905L, 1813L)), .Names = c("duration", "start",
"end"), class = "data.frame", row.names = c("5", "55", "92"))
y <- structure(list(duration = c(2L, 2L, 2L, 1L, 4L, 2L, 5L, 2L, 2L,
2L, 2L, 1L), start = c(107L, 116L, 131L, 145L, 166L, 212L, 293L,
704L, 877L, 941L, 1787L, 1982L), end = c(108L, 117L, 132L, 145L,
169L, 213L, 296L, 705L, 878L, 942L, 1788L, 1982L)), .Names = c("duration",
"start", "end"), class = "data.frame", row.names = c("4", "6",
"8", "10", "12", "16", "22", "58", "70", "72", "109", "121"))
- I take the first observation of x and calculate the distance to the closest occurrence of y from either end of x. (relational) Repeat for all x_i
- I take the first observation of x and calculate the distance to the closest occurrence of y from the front end of x. (causal; you believe x causes y). repeat for all x_i.
These will output a vector of distances.
Mathematically here's a whack representing it with numbers:
\(min(|y - x_i|)\)
As I write it out the 2 things I am struggling with is how to:
- Say take the minimum distance
- For the second calculation method I described above (take only from y values occurring after x_i) how to say only take the positive values.
I can write the code and have it for R if this is useful in understanding what I'm doing though it's not really reproducible as it relies on my package I'm creating:
Code:
cm_describe <- function(code, grouping.var = NULL) {
if (!is.null(grouping.var)) {
if (is.list(grouping.var)) {
m <- unlist(as.character(substitute(grouping.var))[-1])
m <- sapply(strsplit(m, "$", fixed = TRUE),
function(x) x[length(x)])
NAME <- paste(m, collapse = "&")
} else {
G <- as.character(substitute(grouping.var))
NAME <- G[length(G)]
}
cname <- strsplit(as.character(substitute(code)), "&")
NAME <- paste0(cname[[length(cname)]], "&", NAME)
group.var <- if (is.list(grouping.var) & length(grouping.var)>1) {
apply(data.frame(grouping.var), 1, function(x){
if (any(is.na(x))){
NA
} else {
paste(x, collapse = ".")
}
}
)
} else {
grouping.var
}
v <- do.call(data.frame, rle(paste2(list(code, group.var))))
} else {
v <- do.call(data.frame, rle(code))
}
v$end<- cumsum(v[, 1])
colnames(v)[1] <- "duration"
v$start <- c(0, c(v$end+1)[-c(length(v$end))])
v$center <- (v$start + v$end)/2
v2 <- v[, c("values", "center", "duration", "start", "end")]
if (!is.null(grouping.var)) {
nl <- if (is.list(grouping.var)) {
grouping.var
} else {
list(grouping.var)
}
L2 <- lapply(1:(length(nl) + 1), function(i) {
x <- strsplit(as.character(v2[, "values"]), "\\.")
sapply(1:length(x), function(j)x[[j]][i])
}
)
v3 <- data.frame(do.call(cbind, L2))
colnames(v3) <- unlist(strsplit(NAME, "\\&"))
v2 <- data.frame(v3, v2[, -1, drop=FALSE])
} else {
cname <- strsplit(as.character(substitute(code)), "&")
colnames(v2)[1] <- cname[[length(cname)]]
}
return(v2)
}
cm_bidist <- function(code_x, code_y, grouping.var = NULL) {
x <- cm_describe(code_x, grouping.var)
x <- x[as.numeric(as.character(x[, "code_x"])) > 0, ]
y <- cm_describe(code_y, grouping.var)
y <- y[as.numeric(as.character(y[, "code_y"])) > 0, ]
Dnc <- sapply(1:nrow(x), function(i) min(abs(c(y[, "start"],
y[, "end"]) - c(x[i, "start"], x[i, "end"]))))
Dc <- sapply(1:nrow(x), function(i) {
vals <- c(y[, "start"], y[, "end"]) - c(x[i, "start"], x[i, "end"])
if (sum(vals[vals >= 0]) == 0) {
return(NA) #there should be a penalty for this
} else {
min(vals[vals >= 0])
}
}
)
list(associated_distance = Dnc, mean.sd_assoc_dist = c(mean(Dnc), sd(Dnc)),
causal_distance = Dc, mean.sd_causal_dist = c(mean(na.omit(Dc)), sd(na.omit(Dc))))
}