peeling the onion: plotting only the inner 95% of 1000 cumulative frequency curves

gianmarco

TS Contributor
#1
Hello,
I am playing around with the function below, which (using the dataset appended further below) produces the attached plot.
The function requires the rgeos and spatstat packages. I have added some comments to the code to explain what is actually going on.

To get the plot use: advNNa(springs)

Now, as you can see from the chart, 1000 cumulative curves are plotted (in gray), creating a sort of envelope around the black cumulative curve based on the observed data (the distance of each point to its nearest neighbor). Plotting 1000 curves and stacking them on top of each other is not very sensible computationally. I was wondering how I can 'peel out' the envelope in order to plot just the inner 95%, i.e. dropping the top and bottom 2.5% of the gray curves.

C-like:
advNNa <- function (feature, studyplot=NULL, buffer=0, B=1000, order=1) {
  if(is.null(studyplot)==TRUE){
    ch <- rgeos::gConvexHull(feature)
    region <- rgeos::gBuffer(ch, width=buffer)
  } else {
    region <- studyplot
  }
  dst <- spatstat::nndist(coordinates(feature), k=order)                                 #for each point in the input feature dataset, calculate the distance to its nearest neighbor
  dst.ecdf <- ecdf(dst)                                                                  #calculate the ECDF of the observed NN distances
  dist.rnd.mtrx <- matrix(nrow=length(feature), ncol=B)                                  #create a matrix to store the distance of each random point to its nearest neighbor; each column corresponds to a random set of points
  pb <- txtProgressBar(min = 0, max = B, style = 3)                                      # set the progress bar to be used later on within the loop
  for (i in 1:B){
    rnd <- sp::spsample(region, n=length(feature), type='random')                        #draw a random sample of points within the study region
    dist.rnd.mtrx[,i] <- spatstat::nndist(coordinates(rnd), k=order)                     #calculate the NN distances of the random points and store them in the matrix (column-wise)
    setTxtProgressBar(pb, i)                                                  
    }
  rnd.ecdf <- ecdf(dist.rnd.mtrx[,1])                                                    #calculate the ECDF for the first random dataset
  plot(rnd.ecdf, verticals=TRUE, do.points=FALSE,                                        #plot the ECDF of the first random dataset
       col="#A9A9A988", xlab="Nearest Neighbor distance (d)",
       ylab="G (d)",
       main="Refined Nearest Neighbor analysis (G function)",
       cex.main=0.95,
       xlim=c(min(min(dist.rnd.mtrx), min(dst)), max(max(dist.rnd.mtrx), max(dst))))
  for (i in 2:B){                                                                    
    rnd.ecdf <- ecdf(dist.rnd.mtrx[,i])                                                 #calculate the ECDF of the remaining random sets
    plot(rnd.ecdf,                                                                      #plot the above and add the plot to the preceding one
         verticals=TRUE,
         do.points=FALSE,
         add=TRUE, col="#A9A9A988",
         xlim=c(min(min(dist.rnd.mtrx), min(dst)), max(max(dist.rnd.mtrx), max(dst))))
  }
  plot(dst.ecdf,                                                                        #plot the ECDF of the input dataset and add it to the preceding plots
       verticals=TRUE,
       do.points=FALSE,
       add=TRUE,
       xlim=c(min(min(dist.rnd.mtrx), min(dst)), max(max(dist.rnd.mtrx), max(dst))))
}
Data:
C-like:
springs <- new("SpatialPointsDataFrame"
    , data = structure(list(OBJECTID_1 = 1:49, OBJECTID = c(5L, 6L, 9L, 11L,
12L, 13L, 14L, 15L, 17L, 18L, 19L, 20L, 22L, 23L, 24L, 25L, 27L,
29L, 30L, 32L, 33L, 35L, 36L, 37L, 41L, 49L, 50L, 52L, 55L, 56L,
57L, 59L, 61L, 62L, 63L, 65L, 66L, 68L, 70L, 71L, 72L, 73L, 75L,
79L, 82L, 83L, 84L, 85L, 87L), location = c(NA, NA, "Marsa",
"Ras il-Knejjes", "Ghallis / Bahar ic-Caghaq", "Wied ir-Rum",
"Gnien is-Sultan", "Mtahleb", "Korradino", "Marsa", "Mellieha",
"Ghajn Sfurija / Rdum il-Pellegrin", "Mtahleb", "Gebel Ciantar",
"Selmun", "Mdina / Mtarfa", "Mizieb ir-Rih, south of Mellieha",
"Tas-Santi", "Kalkara, near Mistra Village", "Mtahleb", "Girgenti",
"San Gorg tal-Ghadir, B'buga", "West of Rabat", "West of Mdina",
"Marsaxlokk", "Gebel Ciantar", "West of Rabat", "Zebbiegh / Torri Falka",
"Bingemma", "Marsa", "Tal-Vecca, St. Paul's Bay", "Near Burmarrad",
"Wardija (?)", "Fiddien", "Burmarrad", "Dellimara / Marnisi",
"Mgarr", "West of Pwales", "Mgarr", "Mtahleb / Wied ir-Rum",
"Gnien is-Sultan, Rabat", "Ghajn Tuffieha", "Mellieha (Tal-Fkieren)",
"Cliff end of Ghajn Zejtuna", "Ghajn Zejtuna", "East of Mellieha",
"West of Mellieha", "Mgarr", "Marsaskala"), comment = c("no correspondence in Grima's Appendix 3",
"no correspondence in Grima's Appendix 3", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), toponym = c("Ghajn Kaijet",
"Ghajn Clieb", "Ghajn, tal-", "Ghajn Bierda", "Ghajn ta' Bir il-Bahar",
"Ghajn Btejtes", "Ghajn Cirani", "Ghajn Corr", "Ghajn Dwieli",
"Ghajn Filep/Qortin/Ghajn\nFormag", "Ghajn tal-Fkieren", "Ghajn Gifra",
"Ghajn Ghorab", "Ghajn Ghulem Alla", "Ghajn Hadid", "Ghajn Hamiem",
"Ghajn Hommed", "Ghajn tal-Kalkara", "Ghajn tal-Kalkara", "Ghajn il-Kbira",
"Ghajn il-Kbira", "Ghajn Kittien", "Ghajn il-Klieb", "Ghajn Kullija (Qollija)",
"Ghajn ta' Marsaxlokk, tal-", "Ghajn Qadi", "Ghajn Qajjied",
"Ghajn il-Qasab", "Ghajn Qattus, ta\u0092", "Ghajn Rabib", "Ghajn Razun (Rasul)",
"Ghajn Rihana", "Ghajn Saliba, ta\u0092", "Ghajn ta\u0092 San Pawl",
"Ghajn Selmet", "Ghajn Sender", "Ghajn Sfurija", "Ghajn Stas",
"Ghajn Targa", "Ghajn Tejtes", "Ghajn Tewzien", "Ghajn Tuffieha",
"Ghajn Tuta", "Ghajn Xajxa", "Ghajn Xorok, ta\u0092", "Ghajn Zejtuna",
"Ghajn Znuber, ta\u0092", "Tal-Ghajn", "Wied il-Ghajn"), Island = c("Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta", "Malta",
"Malta", "Malta", "Malta", "Malta", "Malta", "Malta")), .Names = c("OBJECTID_1",
"OBJECTID", "location", "comment", "toponym", "Island"), row.names = c(NA,
-49L), class = "data.frame")
    , coords.nrs = numeric(0)
    , coords = structure(c(444679.6213, 444489.1209, 453000, 440500, 450000,
442000, 445000, 441500, 455898.8655, 454000, 444000, 441000,
442000, 447000, 444689.442, 446000, 443000, 442000, 444500, 441500,
446600, 458000, 444500, 446000, 459000, 447500, 444800, 445000,
444569.3845, 453500, 445000, 447578.0522, 444000, 444000, 446500,
458000, 442500, 443932.5311, 442000, 441800, 444500, 441481.4156,
444000, 443500, 443000, 443131.7628, 440725.362, 443142.0498,
460307.5152, 3971824.5466, 3971673.0724, 3970000, 3973500, 3977500,
3970500, 3972000, 3971000, 3970643.9336, 3971000, 3979000, 3975000,
3970000, 3966500, 3980527.5167, 3972000, 3979000, 3974000, 3979100,
3970500, 3968200, 3966000, 3971600, 3971800, 3967000, 3967000,
3971700, 3975000, 3974490.602, 3971000, 3978200, 3976186.4681,
3976500, 3972000, 3977500, 3966500, 3975500, 3977485.4479, 3975500,
3970800, 3972000, 3976353.484, 3979000, 3980500, 3980000, 3980715.9004,
3978540.4813, 3975342.6171, 3969478.1718), .Dim = c(49L, 2L), .Dimnames = list(
    NULL, c("coords.x1", "coords.x2")))
    , bbox = structure(c(440500, 3966000, 460307.5152, 3980715.9004), .Dim = c(2L,
2L), .Dimnames = list(c("coords.x1", "coords.x2"), c("min", "max"
)))
    , proj4string = new("CRS"
    , projargs = "+proj=utm +zone=33 +ellps=intl +units=m +no_defs"
)
)
 

Attachments: plot produced by advNNa(springs) — 1000 gray cumulative curves forming an envelope around the black observed curve.


Dason

Ambassador to the humans
#2
Do you want to do this on a curve basis or at each individual x value? At least conceptually it's not too bad to think about how you would do it at each value of x. But if you want to do it by pulling out only 95% of the curves, you would need to define a way to order the curves in some sense.
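For the pointwise option, a minimal sketch with made-up data (just to illustrate the idea): evaluate every curve at one fixed x and take the middle 95% of those heights.
Code:
set.seed(1)
curves <- lapply(1:1000, function(i) ecdf(rnorm(30)))   # stand-ins for the 1000 gray curves
x0 <- 0.5                                               # one fixed x value
vals <- sapply(curves, function(f) f(x0))               # height of each curve at x0
quantile(vals, probs = c(0.025, 0.975))                 # pointwise 95% limits at x0
Repeating this over a grid of x values gives a lower and an upper boundary curve.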
 

gianmarco

TS Contributor
#3
@Dason:
thanks for your reply. First of all, I wasn't very clear in my opening post about keeping only the middle 95%. In effect, I would like to end up with just two curves representing the lower and upper "boundaries" of the envelope.
That said, do you have any suggestions on how to accomplish this at the level of each x value, as you suggested?
 

Dason

Ambassador to the humans
#4
Here is some sample code for generating the ecdfs for a bunch of samples and then trimming it down "pointwise". The plotting is then left as an exercise for the reader.

C-like:
#generate some fake data
mydat <- lapply(1:10000, function(i){rnorm(30)})
# Get the ecdf for each dataset
ecdfs <- lapply(mydat, ecdf)
# Get the x values you want to plot for
xs <- seq(-3, 3, by = .01)
# get the ecdf values for each of the xs
out <- lapply(seq_along(ecdfs), function(i){ecdfs[[i]](xs)})
# Put it all together into a matrix
tmp <- do.call(rbind, out)
# Get the .025 and .975 quantile for each column
# at this point each column is a fixed 'x' and the rows
# are the different ecdfs applied to that
lower <- apply(tmp, 2, quantile, probs = .025)
upper <- apply(tmp, 2, quantile, probs = .975)

# I put it all into a data.frame
ans <- data.frame(x = xs, lower95 = lower, upper95 = upper)
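And, in case it is useful, a minimal sketch of the plotting exercise, picking up from the ans data frame built above:
Code:
# Sketch of the plotting step, using the columns defined in `ans`
plot(ans$x, ans$upper95, type = "n", ylim = c(0, 1),
     xlab = "x", ylab = "ECDF")                 # set up an empty plotting region
lines(ans$x, ans$lower95, col = "red")          # pointwise 2.5% boundary
lines(ans$x, ans$upper95, col = "red")          # pointwise 97.5% boundary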
 

gianmarco

TS Contributor
#5
Dason:
thanks for the code. However, I get an error when I run this line:
lower <- apply(out, 2, quantile, probs = .025)

Everything goes smoothly until I hit that line. Any idea? Does it work for you?
 

gianmarco

TS Contributor
#7
Hello @Dason,
I went through the code today, but I am having a hard time adapting it to my situation.
Would you mind helping me sort things out?

In particular, I am stuck on the first part of your suggested code: which bit(s) of your code correspond to my observed data, and which correspond to the random points?

As for my code, the observed data (i.e., the Nearest Neighbor distances and their cumulative frequency) are computed by:
dst <- spatstat::nndist(coordinates(feature), k=order)
dst.ecdf <- ecdf(dst)


The random points (within the study plot) are obtained via a loop:
for (i in 1:B){
rnd <- sp::spsample(region, n=length(feature), type='random')
dist.rnd.mtrx[,i] <- spatstat::nndist(coordinates(rnd), k=order)
}



Referring to your example, do your “fake data” correspond to the set of random points in my situation, and the "xs" to the observed NN distances?

Best
Gm
 

gianmarco

TS Contributor
#9
I will try to provide more details and a reproducible example in the hope of getting further help. I am going to provide links to two .rtf files; I tried to paste the two datasets here, but there seems to be a 1000-word limit.

I have a set of distances; each value is the distance of a given point to its nearest neighbor (N=49):
https://www.dropbox.com/s/dt0utiymomasetq/dst.rtf?dl=0

Using a loop, I got the following matrix (49 rows by 10 columns); each column represents a random sample of 49 points and the values are again the distance of each point to its nearest neighbor:
https://www.dropbox.com/s/txx6od6m787sbus/dist.rnd.matrx.rtf?dl=0

What I am after is to plot the cumulative frequency curve of dst along with two cumulative frequency curves representing a sort of confidence region based on dist.rnd.matrx. In other words, the data from the matrix should serve to plot a sort of envelope representing the inner 95% of all the cumulative freq curves based on the matrix.

I got a reply from @Dason, but I couldn't adapt his suggestions to my context. Maybe the above pieces of data will prove useful in helping me find a viable solution to my issue.

Thanks
Gm
 

Dason

Ambassador to the humans
#10
I guess relentless nagging does get you your way eventually :p
Code:
library(rgeos)
library(spatstat)
library(sp)

advNNa <- function (feature, studyplot=NULL, buffer=0, B=1000, order=1) {

    if(is.null(studyplot)==TRUE){
        ch <- rgeos::gConvexHull(feature)
        region <- rgeos::gBuffer(ch, width=buffer)
    } else {
        region <- studyplot
    }
    dst <- spatstat::nndist(coordinates(feature), k=order)                                 #for each point in the input feature dataset, calculate the distance to its nearest neighbor
    dst.ecdf <- ecdf(dst)                                                                  #calculate the ECDF of the observed NN distances
    dist.rnd.mtrx <- matrix(nrow=length(feature), ncol=B)                                  #create a matrix to store the distance of each random point to its nearest neighbor; each column corresponds to a random set of points
    pb <- txtProgressBar(min = 0, max = B, style = 3)       

    for (i in 1:B){
        rnd <- sp::spsample(region, n=length(feature), type='random')                        #draw a random sample of points within the study region
        dist.rnd.mtrx[,i] <- spatstat::nndist(coordinates(rnd), k=order)                     #calculate the NN distances of the random points and store them in the matrix (column-wise)
        setTxtProgressBar(pb, i)                                                  
    }
    

    # Make a list for the ecdfs
    rnd.ecdfs <- list()
    for(i in 1:ncol(dist.rnd.mtrx)){
        rnd.ecdfs[[i]] <- ecdf(dist.rnd.mtrx[,i])
    }
    
    # Why didn't they just precompute and use this as xlim in all of their
    # calls in the first place? So much code duplication...
    xlim = c(min(min(dist.rnd.mtrx), min(dst)), max(max(dist.rnd.mtrx), max(dst)))
    
    # We will evaluate the ecdfs on a grid of 1000 points between
    # the x limits
    xs <- seq(xlim[1], xlim[2], length.out = 1000)
    # This actually gets those evaluations and puts them into a matrix
    out <- lapply(seq_along(rnd.ecdfs), function(i){rnd.ecdfs[[i]](xs)})
    tmp <- do.call(rbind, out)
    
    # Get the .025 and .975 quantile for each column
    # at this point each column is a fixed 'x' and the rows
    # are the different ecdfs applied to that
    lower <- apply(tmp, 2, quantile, probs = .025)
    upper <- apply(tmp, 2, quantile, probs = .975)
    
    
    # Plot the original data
    plot(dst.ecdf, verticals=TRUE, do.points=FALSE,                                        #plot the ECDF of the observed data
         #col="#A9A9A988", 
         xlab="Nearest Neighbor distance (d)",
         ylab="G (d)",
         main="Refined Nearest Neighbor analysis (G function)",
         cex.main=0.95,
         xlim= xlim)
    
    # Add in the quantiles.
    lines(xs, lower, col = "red")
    lines(xs, upper, col = "red")
    
}

    
# it was too long with the springs definition in there so you'll just have to read it in yourself.
advNNa(springs)
 

hlsmith

Omega Contributor
#11
IMHO, the graph would also benefit from some transparency in the lines. How can I tell the individual shapes apart within such an ominous gray mass? Each curve could be stacked perfectly on top of the others, like spoons, or there could be a lot of crisscrossing lines with a stronger density in certain areas. This may not be what you are looking for, but it would help me as a reader.
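For instance (just a sketch of the idea), a more transparent gray can be built in base R with rgb(), whose fourth argument is the alpha level:
Code:
light_gray <- rgb(0.5, 0.5, 0.5, 0.1)   # mostly transparent gray
# e.g. plot(rnd.ecdf, add = TRUE, verticals = TRUE, do.points = FALSE, col = light_gray)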

Also, I did not read any of the posts, but how do you define the inner 95% when the lines crisscross or wiggle?
 

gianmarco

TS Contributor
#12
@Dason:
thanks indeed for your help, which has been much appreciated.
As a token of my good will in trying to solve my problems by myself (to the best of my R programming skills), I am pasting below a slightly modified version of the function, in which I added the expected CDF and, instead of having two confidence bands, I managed to plot a sort of confidence "polygon".
C-like:
advNNa <- function (feature, studyplot=NULL, buffer=0, B=1000, order=1) {
  if(is.null(studyplot)==TRUE){
    ch <- rgeos::gConvexHull(feature)
    region <- rgeos::gBuffer(ch, width=buffer)
  } else {
    region <- studyplot
  }
  dst <- spatstat::nndist(coordinates(feature), k=order)                                 #for each point in the input feature dataset, calculate the distance to its nearest neighbor
  dst.ecdf <- ecdf(dst)                                                                  #calculate the ECDF of the observed NN distances
  dist.rnd.mtrx <- matrix(nrow=length(feature), ncol=B)                                  #create a matrix to store the distance of each random point to its nearest neighbor; each column corresponds to a random set of points
  pb <- txtProgressBar(min = 0, max = B, style = 3)      

  for (i in 1:B){
    rnd <- sp::spsample(region, n=length(feature), type='random')                        #draw a random sample of points within the study region
    dist.rnd.mtrx[,i] <- spatstat::nndist(coordinates(rnd), k=order)                     #calculate the NN distances of the random points and store them in the matrix (column-wise)
    setTxtProgressBar(pb, i)                                                
  }

  # Make a list for the ecdfs
  rnd.ecdfs <- list()
  for(i in 1:ncol(dist.rnd.mtrx)){
    rnd.ecdfs[[i]] <- ecdf(dist.rnd.mtrx[,i])
  }

  # Why didn't they just precompute and use this as xlim in all of their
  # calls in the first place? So much code duplication...
  xlim = c(min(min(dist.rnd.mtrx), min(dst)), max(max(dist.rnd.mtrx), max(dst)))

  # We will evaluate the ecdfs on a grid of 1000 points between
  # the x limits
  xs <- seq(xlim[1], xlim[2], length.out = 1000)
  # This actually gets those evaluations and puts them into a matrix
  out <- lapply(seq_along(rnd.ecdfs), function(i){rnd.ecdfs[[i]](xs)})
  tmp <- do.call(rbind, out)

  # Get the .025 and .975 quantile for each column
  # at this point each column is a fixed 'x' and the rows
  # are the different ecdfs applied to that
  lower <- apply(tmp, 2, quantile, probs = .025)
  upper <- apply(tmp, 2, quantile, probs = .975)

  # Calculate the expected distribution
  d <- 0:max(dst)
  lambda <- length(feature) / rgeos::gArea(region)                                       #intensity: points per unit area of the study region
  E <- 1 - exp(-1 * lambda * pi * d^2)

  # Plot the original data
  plot(dst.ecdf, verticals=TRUE, do.points=FALSE,                                        #plot the ECDF of the observed data
       #col="#0000FF",
       xlab="Nearest Neighbor distance (d)",
       ylab="G (d)",
       main="Refined Nearest Neighbor analysis (G function)",
       cex.main=0.95,
       xlim= xlim)

  # Add in the quantiles.
  polygon(c(xs,rev(xs)), c(upper, rev(lower)), col = "#DBDBDB88", border = NA)
  lines(d, E, col="red")
}
But I ran into an additional (possibly easy-to-solve) issue on which I would like some guidance.
As you can see from the attached chart, the black cumulative curve is plotted first and ends up underneath the confidence envelope. Is there a way to plot the black curve on top of the polygon?
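A minimal sketch of one possible fix, assuming the rest of the function stays unchanged: re-draw the observed ECDF after the polygon() and lines() calls, using add=TRUE as in the earlier version of the function, so the black curve ends up on top.
Code:
# at the very end of advNNa(), after polygon() and lines():
plot(dst.ecdf, verticals=TRUE, do.points=FALSE, add=TRUE)   # re-drawn on top of the envelope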
 

Attachments: plot showing the gray confidence polygon and red expected curve, with the observed black curve drawn underneath.