Data preprocessing

Loading the raw dataset from the package1.

library(netdiffuseR)
data(medInnovations)

Now that we have the data in R, we can start working with it, in particular, we want to do the following things:

# Creating unique ids (including for the network data)
othervars <- c("id", "toa", "city")
netvars <- names(medInnovations)[grepl("^net", names(medInnovations))]
for (i in c("id", netvars))
  medInnovations[[i]] <- medInnovations[[i]] + medInnovations$city*1000

# Leaving unsurveyed individuals with NA
surveyed <- medInnovations$id
for (i in netvars)
  medInnovations[[i]][which(!(medInnovations[[i]] %in% surveyed))] <- NA

# Adding autoedges to farmers that are isolated, we need to do this otherwize
# these will be dropped when calling the function -edgelist_to_adjmat-. Notice
# that this does not imply that the graph will have autoedges. (see manual)
isolated <- which(apply(medInnovations[, netvars], 1, function(x) all(is.na(x))))
medInnovations[isolated, netvars[1]] <- medInnovations$id[isolated]

# Reshaping data (so we have an edgelist)
medInnovations.long <- reshape(
  medInnovations[,c(othervars, netvars)], v.names= "net",
  varying = netvars,
  timevar = "level", idvar="id", direction="long")

Once we have the data in long format, we can coerce it into an diffnet object. This is done by reading the edgelist, obtaining the times of adoption vector and applying the as_diffnet function.

# Coersing the edgelist to an adjacency matrix. Here we are assuming that the
# network is constant through time.
graph <- with(
  medInnovations.long,
  edgelist_to_adjmat(cbind(id, net), t=18,undirected=FALSE, use.incomplete=FALSE)
)
## Warning in edgelist_to_adjmat.matrix(edgelist, weights, times, t, simplify, : Some vertices had NA/NULL values:
##  1, 5, 18, 28, 29, 41, 43, 51, 52, 54, 59, 61, 65, 66, 67, 68, 76, 82, 83, 85, ...
## The complete list will be stored as an attribute of the resulting adjacency matrix, namely, -incomplete-.
# Here we are retrieving the set of individuals who actually were used in the
# network (as these are not isolated nodes)
used.vertex <- rownames(graph[[1]])
medInnovations <- subset(medInnovations, id %in% used.vertex)

# Create the vector (subset) of times of adoption using only the individuals
# that are included in the adjacency matrix
toa <- medInnovations$toa

# Creating a diffnet object
diffnet <- as_diffnet(graph, toa, vertex.static.attrs = medInnovations)
## Warning in as_diffnet(graph, toa, vertex.static.attrs = medInnovations):
## Coercing -toa- into integer.

Methods

Once a diffnet object, we can apply the usual generic R functions:

plot(diffnet, t=diffnet$meta$nper)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve

diffnet
## Dynamic network of class -diffnet-
##  # of nodes        : 125
##  # of time periods : 18
##  Final prevalence  : 1.00
##  Type              : directed
summary(diffnet)
## Diffusion network summary statistics
## -----------------------------------------------------------------------
##  Period  Adopters Cum Adopt. Cum Adopt. % Hazard Rate Density Moran's I 
## -------- -------- ---------- ------------ ----------- ------- --------- 
##        1       11         11         0.09           -    0.02      0.05 
##        2        9         20         0.16        0.08    0.02      0.02 
##        3        9         29         0.23        0.09    0.02     -0.03 
##        4       11         40         0.32        0.11    0.02     -0.02 
##        5       11         51         0.41        0.13    0.02     -0.04 
##        6       11         62         0.50        0.15    0.02     -0.02 
##        7       13         75         0.60        0.21    0.02     -0.01 
##        8        7         82         0.66        0.14    0.02      0.00 
##        9        4         86         0.69        0.09    0.02     -0.00 
##       10        1         87         0.70        0.03    0.02      0.00 
##       11        5         92         0.74        0.13    0.02      0.01 
##       12        3         95         0.76        0.09    0.02      0.01 
##       13        3         98         0.78        0.10    0.02      0.01 
##       14        4        102         0.82        0.15    0.02      0.03 
##       15        4        106         0.85        0.17    0.02      0.04 
##       16        2        108         0.86        0.11    0.02      0.03 
##       17        1        109         0.87        0.06    0.02      0.01 
##       18       16        125         1.00        1.00    0.02      0.00 
## -----------------------------------------------------------------------
##  Left censoring  : 0.09 (11)
##  Right centoring : 0.00 (0)
##  # of nodes      : 125

And the ones included in the package:

plot_diffnet(diffnet, slices=c(1,4,8,12,16,18))

plot_threshold(diffnet, undirected = FALSE, vertex.cex = 1/5)

plot_adopters(diffnet)

plot_hazard(diffnet)

Statistical test

Now, we want to know if the threshold model fits here. In order to do so we will use the structure dependency test built in the package, boot_net. We will apply this both in a aggregated level and by city. First we need to subset the data:

# Get cities ids so we can subset the vertices and run the test by city.
city <- diffnet$vertex.static.attrs[,"city"]

# Subsetting diffnet, notice that we can use either indices or ids to create a
# "subdiffnet". In this case we are using indices.
diffnet_city1 <- diffnet[which(city==1),]
diffnet_city2 <- diffnet[which(city==2),]
diffnet_city3 <- diffnet[which(city==3),]
diffnet_city4 <- diffnet[which(city==4),]

Notice that by subsetting the set of vertices we have created 4 new diffnet objects, so all the methods and functions work for each one of these, for example, threshold levels in each city

oldpar <- par(no.readonly = TRUE)
par(mfrow=c(2,2))
plot_threshold(diffnet_city1, vertex.label = "", main="Threshold and ToA\nin City 1")
plot_threshold(diffnet_city2, vertex.label = "", main="Threshold and ToA\nin City 2")
plot_threshold(diffnet_city3, vertex.label = "", main="Threshold and ToA\nin City 3")
plot_threshold(diffnet_city4, vertex.label = "", main="Threshold and ToA\nin City 4")

par(oldpar)
plot_infectsuscep(diffnet_city1, K=5, logscale = TRUE, bins=20)
## Warning in plot_infectsuscep.list(graph$graph, graph$toa, t0, normalize, :
## When applying logscale some observations are missing.

Now we run the test for each city. Observe that we can use the parallel package to speedup the test as we will do in the first two cities using two cores (this is done thanks to the boot package).

# Defining the statistic
avgthr <- function(x) mean(threshold(x), na.rm = TRUE)

# Running the test by city
test1   <- boot_net(diffnet_city1, avgthr, 2000, ncpus=2, parallel="multicore")
test2   <- boot_net(diffnet_city2, avgthr, 2000, ncpus=2, parallel="multicore")
test3   <- boot_net(diffnet_city3, avgthr, 2000)
test4   <- boot_net(diffnet_city4, avgthr, 2000)

# Running the test aggregated
testall <- boot_net(diffnet, avgthr, 2000, ncpus=2, parallel="multicore")

# Printing the outcomes
test1
## Network Rewiring graph (2000 simulations)
## # nodes           : 62
## # of time periods : 18
## --------------------------------------------------------------------------------
##  H0: t - t0 = 0 (No structure dependency)
##    t0 (observed) = 0.6303763
##    t (simulated) = 0.5035853
##    p-value = 0.00100
test2
## Network Rewiring graph (2000 simulations)
## # nodes           : 24
## # of time periods : 18
## --------------------------------------------------------------------------------
##  H0: t - t0 = 0 (No structure dependency)
##    t0 (observed) = 0.4930556
##    t (simulated) = 0.4611856
##    p-value = 0.63400
test3
## Network Rewiring graph (2000 simulations)
## # nodes           : 21
## # of time periods : 18
## --------------------------------------------------------------------------------
##  H0: t - t0 = 0 (No structure dependency)
##    t0 (observed) = 0.4365079
##    t (simulated) = 0.4696438
##    p-value = 0.64200
test4
## Network Rewiring graph (2000 simulations)
## # nodes           : 18
## # of time periods : 18
## --------------------------------------------------------------------------------
##  H0: t - t0 = 0 (No structure dependency)
##    t0 (observed) = 0.5925926
##    t (simulated) = 0.4899699
##    p-value = 0.12800
testall
## Network Rewiring graph (2000 simulations)
## # nodes           : 125
## # of time periods : 18
## --------------------------------------------------------------------------------
##  H0: t - t0 = 0 (No structure dependency)
##    t0 (observed) = 0.566
##    t (simulated) = 0.5010325
##    p-value = 0.01400

This shows that City 1 is the only place where threshold seems to be struture dependent, as after simulating 2,000 networks (by rewiring each one of these so all have the same number of vertices and density) the average threshold of the rewired networks and the observed network are statistically different. Now we can make an histogram of the outcomes by city:

# To make it nicer, we change the parameters in using par
# (see ?par)
oldpar <- par(no.readonly = TRUE)
par(mfrow=c(2,2))

# Now we use the hist method for the -diffnet_boot- class
hist(test1, main="Distribution of Statistic on rewired\nnetwork (City 1)")
hist(test2, main="Distribution of Statistic on rewired\nnetwork (City 2)")
hist(test3, main="Distribution of Statistic on rewired\nnetwork (City 3)")
hist(test4, main="Distribution of Statistic on rewired\nnetwork (City 4)")

# Returning to the previous graphical parameters
par(oldpar)

Interestingly, since City 1 has the most vertices and its p-value is roughly 0.001, computing the test for the agreggate leads to a p-value of ~0.02, which can be misleading pointing that the threshold model fits the entire network.

Retrieving the data to create a panel/envent history/longitudinal data

To use the data for statistical models we can retrieve the data stored in the diffnet object and coerce it as a data.frame. First, to show the richness of the package, we will compute exposure at each time period and add it as a dynamic vertex attribute.

# Calculating exposure
expo <- exposure(diffnet)
head(expo)
##      1         2         3         4         5         6         7
## 1001 0 0.0000000 0.5000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1002 0 0.0000000 0.5000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1003 0 0.2000000 0.4000000 0.6000000 0.8000000 0.8000000 0.8000000
## 1004 0 0.3333333 0.3333333 0.3333333 0.3333333 0.6666667 0.6666667
## 1005 0 0.0000000 0.0000000 0.0000000 1.0000000 1.0000000 1.0000000
## 1006 0 0.3333333 0.6666667 0.6666667 1.0000000 1.0000000 1.0000000
##              8         9        10        11        12        13        14
## 1001 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1002 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1003 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1004 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667
## 1005 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## 1006 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##             15        16        17 18
## 1001 1.0000000 1.0000000 1.0000000  1
## 1002 1.0000000 1.0000000 1.0000000  1
## 1003 1.0000000 1.0000000 1.0000000  1
## 1004 0.6666667 0.6666667 0.6666667  1
## 1005 1.0000000 1.0000000 1.0000000  1
## 1006 1.0000000 1.0000000 1.0000000  1
# Must be a list to be used in diffnet
expo <- lapply(1:ncol(expo), function(x) cbind(netexp=expo[,x]))

# Adding it to diffnet
diffnet.attrs(diffnet, attr.class="dyn") <- expo

Now we can create a data frame from our diffnet object

mydata <- diffnet.attrs(diffnet, as.df = TRUE)

  1. Note that there is a diffnet version of the same dataset in the package, medInnovationsDiffNet.