Getting started

clean up

rm(list=ls())


general custom functions

  • fpackage.check: Check if packages are installed (and install if not) in R (source)
  • fload.R: function to load R-objects under new names.
  • fidmaker: creating mock IDs
fpackage.check <- function(packages) {
    lapply(packages, FUN = function(x) {
        if (!require(x, character.only = TRUE)) {
            install.packages(x, dependencies = TRUE)
            library(x, character.only = TRUE)
        }
    })
}

fload.R  <- function(fileName){
  load(fileName)
  get(ls()[ls() != "fileName"])
}

fidmaker <- function(x) {
    max.val = x * 1e+05
    count <- nchar(as.character(max.val))  # find out how many 'numbers' each id will have after the letter
    size <- paste("%0", count, "d", sep = "")  # set the variable to be fed into 'sprintf' to ensure we have leading 0's
    lets <- toupper(sample(letters, x, replace = T))  # randomizing the letters 
    nums <- sprintf(size, sample(1:max.val)[1:x])  # randomizing the numbers, and ensuring they all have the same number of characters
    ids <- paste(lets, nums, sep = "")  # joining them together
    return(ids)
  }


necessary packages

  • RSiena: creating RSiena data objects
  • dplyr: package for data wrangling
packages = c("RSiena", "dplyr")
fpackage.check(packages)


clubdata.RData

In the following scripts a list containing the (anonymized) data of all clubs is made (clubdata.RData).

  • Our primary network variable is the Kudos-network. A tie i -> j exists if ego i award at least 1 Kudos to alter j.

  • For the behavioral data we include information on the frequency (i.e., in times per week) and volume (i.e., in hours per week) of running activities. We included activity (frequency and volume) in other sports (e.g., cycling and swimming) as a time-varying covariate.

# club string represents the club ID
club_str <- c("clubid1", "clubid2", "clubid3", "clubid4" ,"clubid5") 

# the following script reads the data of the clubs from the folder for each club, stores them in a list, and saves it in an object in the last function call of this script. 


for (i in (1: length(club_str))) {
  
  club_id <- club_str[i]
  
  # read the data from the club
  clubdata <- read.csv(paste("clubs/", club_id, "/", "egoData_extended.csv", sep = ""), row.names = NULL,
                       sep = ",")
  # saving club size
  size <- length(clubdata[, "gender"])
  # the number of months that we want to add as waves
  n_waves <- 12
  
  # let's load the friendship/following network
  friend_data <- as.matrix(read.csv(paste("clubs/", club_id, "/", "socialnetwork.csv", sep = ""), row.names = NULL,
                                    sep = ","))
  # remove the first column (represents index made in the csv)
  friend_data <- friend_data[, 2:ncol(friend_data)]
  
  # and anonymize the user id, with our id-maker function
  
  fakeid <- fidmaker(nrow(friend_data))  # generate random id
  colnames(friend_data) <- fakeid  # anonymizing users
  
  # let's load the kudos network
  path <- paste("clubs/", club_id, "/", "kudos", sep = "")  # create path
  {
    kudo_w1 <- as.matrix(read.csv(paste(path, "1-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w1 <- kudo_w1[, 2:ncol(kudo_w1)]
    kudo_w2 <- as.matrix(read.csv(paste(path, "2-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w2 <- kudo_w2[, 2:ncol(kudo_w2)]
    kudo_w3 <- as.matrix(read.csv(paste(path, "3-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w3 <- kudo_w3[, 2:ncol(kudo_w3)]
    kudo_w4 <- as.matrix(read.csv(paste(path, "4-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w4 <- kudo_w4[, 2:ncol(kudo_w4)]
    kudo_w5 <- as.matrix(read.csv(paste(path, "5-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w5 <- kudo_w5[, 2:ncol(kudo_w5)]
    kudo_w6 <- as.matrix(read.csv(paste(path, "6-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w6 <- kudo_w6[, 2:ncol(kudo_w6)]
    kudo_w7 <- as.matrix(read.csv(paste(path, "7-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w7 <- kudo_w7[, 2:ncol(kudo_w7)]
    kudo_w8 <- as.matrix(read.csv(paste(path, "8-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w8 <- kudo_w8[, 2:ncol(kudo_w8)]
    kudo_w9 <- as.matrix(read.csv(paste(path, "9-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w9 <- kudo_w9[, 2:ncol(kudo_w9)]
    kudo_w10 <- as.matrix(read.csv(paste(path, "10-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w10 <- kudo_w10[, 2:ncol(kudo_w10)]
    kudo_w11 <- as.matrix(read.csv(paste(path, "11-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w11 <- kudo_w11[, 2:ncol(kudo_w11)]
    kudo_w12 <- as.matrix(read.csv(paste(path, "12-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w12 <- kudo_w12[, 2:ncol(kudo_w12)]
  }
  kudos <- array(c(kudo_w1, kudo_w2, kudo_w3, kudo_w4, kudo_w5, kudo_w6, kudo_w7, kudo_w8, kudo_w9,
                   kudo_w10, kudo_w11, kudo_w12), dim = c(size, size, n_waves))  #Kudos matrix
  
  kudo_data <- ifelse(kudos > 0, 1, 0)  #if at least 1 Kudo is send, tie exists
  
  # running time (in hours per week)
  time_run <- array(c(clubdata[, "time_run_1.2019"], clubdata[, "time_run_2.2019"], clubdata[, "time_run_3.2019"],
                      clubdata[, "time_run_4.2019"], clubdata[, "time_run_5.2019"], clubdata[, "time_run_6.2019"],
                      clubdata[, "time_run_7.2019"], clubdata[, "time_run_8.2019"], clubdata[, "time_run_9.2019"],
                      clubdata[, "time_run_10.2019"], clubdata[, "time_run_11.2019"], clubdata[, "time_run_12.2019"]),
                    dim = c(size, 1, n_waves))  # minutes per month
  time_run_h <- time_run/60  # hours per month
  # time_run_30 <- time_run_h * 2 # half hours
  time <- ceiling(time_run_h/4)  # per week
  
  # provide some descriptives 
  # time %>% # absolute table() time %>% # proportionate table() %>%
  # prop.table() %>% round(3) cumprop <- time %>% # cumulative proportions table() %>%
  # prop.table() %>% round(3) %>% cumsum() print(cumprop) 1-cumprop # the percentage of all
  # values that is higher than the particular value
  
  # our idea was to cap running time of at 7+ hours per week.  this resulted in rather smooth
  # (right-skewed) distribution of values for most clubs except for club 5, where 7+ hours
  # category was highly populated over time.  We added 2 extra categories (capped of at 9+
  # hours); this resulted in a rather smooth distribution for this club.
  
  if (club_id == "clubid5") {
    time_run_temp <- ifelse(time > 9, 9, time)
  } else {
    time_run_temp <- ifelse(time > 7, 7, time)
  }
  
  # running frequency (times per week)
  freq_run <- array(c(clubdata[, "X.run_1.2019"], clubdata[, "X.run_2.2019"], clubdata[, "X.run_3.2019"],
                      clubdata[, "X.run_4.2019"], clubdata[, "X.run_5.2019"], clubdata[, "X.run_6.2019"], clubdata[,
                                                                                                                   "X.run_7.2019"], clubdata[, "X.run_8.2019"], clubdata[, "X.run_9.2019"], clubdata[, "X.run_10.2019"],
                      clubdata[, "X.run_11.2019"], clubdata[, "X.run_12.2019"]), dim = c(size, 1, n_waves))  # frequencies per month
  frequencies <- ceiling(freq_run/4)  # per week
  freq_run_temp <- ifelse(frequencies > 7, 7, frequencies)  # cap off at 7 times per week
  

  # we deal with composition change: actors joining the Strava club over time (we assume no leavers);
  # node set was determined at t12; from there, we 'scraped backward' in time;
  # down below, we count the number of waves, from t1 onward, that actors scored 0 on all DVs 
  # that is, where the sum of number of kudos in- and out-ties (network activity), frequency and volume (behavior activity), equals 0.
  # this number +1 is the wave in which actors are assumed to have joined 
  # our model will then assume that entries of joiners happen at the midpoint of the unobserved time period between this and the subsequent wave.
  
  sum_dv <- matrix(NA, nr=size, nc=n_waves)
  
  for (i in 1:size) {
    for (j in 1:n_waves) {
      sum_dv[i,j] <- sum( c( # sum of
        kudo_data[i,,j],     # number of out-ties of actor i at time j
        kudo_data[,i,j],     # number of in-ties of actor i at time j
        freq_run_temp[i,,j], # running frequency of i at j
        time_run_temp[i,,j]  # running volume of i at j
      ))
    }
  }

  t <- rep(1, size) # starting entry t=1
  for ( i in 1:size) {
    for( j in 1:n_waves) {
      
      # if sum = 0 for actor i in wave j, 
      # increment the entry period t with 1
      if(sum_dv[i,j] == 0) {
        t[i] = t[i] + 1
      }
      # if sum > 0, break the loop
      if(sum_dv[i,j] > 0) {
        break
      }
    }
  }
  # entry-waves cannot surpass 12:
  t[which(t>12)] <- 12
  
  # actors that join at t=12, cannot be included into the estimation;
  # they are structural zero;
  # we exclude the rows (i) and columns (j) corresponding to these actors
  # from the kudos matrices
  # we also remove these actors' elements from the running variable objects.
  
  # first, identify these actors (get their index)
  x <- which(t==12)
  
  # make new arrays to store entries of actors, excluding structural zeros
  kudo_data.nm <- array(NA, dim = c(size - length(x),
                                    size - length(x),
                                    n_waves))
  freq_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
  time_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
  
  # fill arrays  
  for (i in 1:n_waves) {
    kudo_data.nm[,,i] <- subset(kudo_data[,,i], select = -x)[-x, ]
    freq_run_temp.nm[,,i] <- freq_run_temp[,,i][-x]
    time_run_temp.nm[,,i] <- time_run_temp[,,i][-x]
  }
  
  # and adjust network size accordingly
  size.nm <- size - length(x)


  # create a list for the times of composition change
  # see 4.3.3 of RSIENA manual
  comp <- rep(list(NA), size.nm)
  for (i in 1:size.nm) {
    comp[[i]] <- c(t[-x][i],n_waves)
  }
  
  # RSiena GOF functions do not combine properly with the method of joiners and leavers
  # But if we use NA codes for the tie variables of absent actors, sienaGOF will work properly.

  # get time of joining for each actor:
  t_j <- t[which(t!=12)] 
  
  # for actors that are partially absent, set entries in the kudos-matrix to NA, 
  # for those waves they are absent, i.e., waves 1 : (t_j-1)
  for ( i in which(t_j>1)) {     # for actors that join later
    for (j in 1:n_waves) {       # for all waves
      if(t_j[i]>j) {             # if their time of joining comes after the current wave
        kudo_data.nm[i,,j] <- NA # set their out-degree entries to NA
        kudo_data.nm[,i,j] <- NA # but also their in-degree entries
        freq_run_temp.nm[i,1,j] <- NA # and their behaviors
        time_run_temp.nm[i,1,j] <- NA
      }g
    }
  }

  #  let's load other activity data (e.g., cycling, swimming) time
  time_ride <- array(c(clubdata[, "time_ride_1.2019"], clubdata[, "time_ride_2.2019"], clubdata[, "time_ride_3.2019"],
                       clubdata[, "time_ride_4.2019"], clubdata[, "time_ride_5.2019"], clubdata[, "time_ride_6.2019"],
                       clubdata[, "time_ride_7.2019"], clubdata[, "time_ride_8.2019"], clubdata[, "time_ride_9.2019"],
                       clubdata[, "time_ride_10.2019"], clubdata[, "time_ride_11.2019"], clubdata[, "time_ride_12.2019"]),
                     dim = c(size, 1, n_waves))
  time_other <- array(c(clubdata[, "time_other_1.2019"], clubdata[, "time_other_2.2019"], clubdata[,
                                                                                                   "time_other_3.2019"], clubdata[, "time_other_4.2019"], clubdata[, "time_other_5.2019"], clubdata[,
                                                                                                                                                                                                    "time_other_6.2019"], clubdata[, "time_other_7.2019"], clubdata[, "time_other_8.2019"], clubdata[,
                                                                                                                                                                                                                                                                                                     "time_other_12.2019"]), dim = c(size, 1, n_waves))

  time_other <- time_ride + time_other  # minutes per month
  time_other_h <- time_other/60  # hours per month
  # time_other_h <- time_other_h * 2 # half hours
  time <- ceiling(time_other_h/4)  #per week
  time_other_temp <- ifelse(time > 7, 7, time)  # cap off at 7 hours per week
  #table(time_other_temp)
  
  # frequency
  freq_ride <- array(c(clubdata[, "X.ride_1.2019"], clubdata[, "X.ride_2.2019"], clubdata[, "X.ride_3.2019"],
                       clubdata[, "X.ride_4.2019"], clubdata[, "X.ride_5.2019"], clubdata[, "X.ride_6.2019"], clubdata[,
                                                                                                                       "X.ride_7.2019"], clubdata[, "X.ride_8.2019"], clubdata[, "X.ride_9.2019"], clubdata[, "X.ride_10.2019"],
                       clubdata[, "X.ride_11.2019"], clubdata[, "X.ride_12.2019"]), dim = c(size, 1, n_waves))
  freq_other <- array(c(clubdata[, "X.other_1.2019"], clubdata[, "X.other_2.2019"], clubdata[, "X.other_3.2019"],
                        clubdata[, "X.other_4.2019"], clubdata[, "X.other_5.2019"], clubdata[, "X.other_6.2019"], clubdata[,
                                                                                                                           "X.other_7.2019"], clubdata[, "X.other_8.2019"], clubdata[, "X.other_9.2019"], clubdata[,
                                                                                                                                                                                                                   "X.other_10.2019"], clubdata[, "X.other_11.2019"], clubdata[, "X.other_12.2019"]), dim = c(size,
                                                                                                                                                                                                                                                                                                              1, n_waves))
  
  freq_other <- freq_ride + freq_other
  frequencies <- ceiling(freq_other/4)
  freq_other_temp <- ifelse(frequencies > 7, 7, frequencies)
  
  # and again, exclude the structural zeros:
  freq_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
  time_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
  for (i in 1:n_waves) {
    freq_other_temp.nm[,,i] <- freq_other_temp[,,i][-x]
    time_other_temp.nm[,,i] <- time_other_temp[,,i][-x]
  }
  
  # separating male/female/other
  
  male <- ifelse(clubdata[, "gender"][-x] == "M", 1,0)
  female <- ifelse(clubdata[, "gender"][-x] == "F", 1, 0)
  other <- ifelse(clubdata[, "gender"][-x] == "O", 1, 0)
  
  # specify months of winter in case we want to use it as a varying covariate starts with
  # december
  winter <- rep(c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), size.nm)
  winter <- matrix(winter, nr = size.nm, nc = n_waves, byrow = TRUE)
  
  # in case we want to condition on the first activity recorded by users:
  st_yr <- clubdata[,'start_date_year']
  st_yr <- st_yr[-x] #exclude NAs
  
  start_year <- ifelse(st_yr <= 2011, 1, 
                       ifelse(st_yr == 2012, 2,
                              ifelse(st_yr == 2013, 3,
                                     ifelse(st_yr == 2014, 4,
                                            ifelse(st_yr == 2015, 5,
                                                   ifelse(st_yr == 2016, 6,
                                                          ifelse(st_yr == 2017, 7,
                                                                 ifelse(st_yr == 2018, 8,
                                                                        9))))))))
  #hist(start_year)
  
  # create a list containing all the read club data for the current club
  club <- list(friendship = friend_data, kudo = kudo_data.nm, freq_run = freq_run_temp.nm, time_run = time_run_temp.nm,
               freq_other = freq_other_temp.nm, time_other = time_other_temp.nm, winter = winter, male = male, female = female,
               other = other, netsize = size.nm, composition = comp, novice=start_year)
  
  
  # save the club object
  save(club, file = paste("clubs/", "club", club_id, ".RData", sep = ""))
  
}


####################################################

# Now that we have saved the clubdata for all clubs, let's combine them in one list

# first clean the working directory, except our club string; we still need that one.
# and our function fload.R
rm(list = setdiff(ls(), c("club_str", "fload.R")))


# load in the separate club-objects
club1 <- fload.R(paste("clubs", "/", "club", club_str[1], ".RData", sep=""))
club2 <- fload.R(paste("clubs", "/", "club", club_str[2], ".RData", sep=""))
club3 <- fload.R(paste("clubs", "/", "club", club_str[3], ".RData", sep=""))
club4 <- fload.R(paste("clubs", "/", "club", club_str[4], ".RData", sep=""))
club5 <- fload.R(paste("clubs", "/", "club", club_str[5], ".RData", sep=""))

# and make a list containing all the clubdata
clubdata <- list(club1, club2, club3, club4, club5)

# save the output
save(clubdata, file = "clubs/clubdata.RData")

clubdata_rsiena.RData

The following script creates a list containing RSiena objects for all clubs. We create a list containing RSiena objects with running frequency as the behavior variable (clubdata_rsiena_freq.RData) and running volume (clubdata_rsiena_vol.RData).

Note: in some groups, dependent variables (either the kudos-network or running behaviors) only have upward or downward changes in particular periods. We lift RSiena’s automatic restriction to follow this pattern in the simulations, by using ‘allowOnly = FALSE’. This must be done for the subsequent meta-analysis.

# clean the working environment 
rm (list = ls( ))

# load the clubdata
load("clubs/clubdata.RData")
str(clubdata) # inspect structure
# clubdata is a list of 5 lists, 
# with each of these lists containing data of the corresponding club.

####################################################


clubdata_rsiena_freq <- list()
clubdata_rsiena_vol <- list()


for (i in 1:5) { 
  club <- clubdata[[i]]
  # specify the roles of variables
  names(club)
  
  # A: network variables
  kudonet <- sienaDependent(club$kudo, allowOnly = FALSE)  #at least one Kudo
  
  # B: behavioral variables
  time_run <- sienaDependent(club$time_run, type= "behavior", allowOnly = FALSE)
  freq_run <- sienaDependent(club$freq_run, type= "behavior", allowOnly = FALSE)
  
  time_other <- varCovar(club$time_other[,,])
  freq_other <- varCovar(club$freq_other[,,])

  # C: covariates
  winter <- varCovar(club$winter)
  gender <- NA #we dichotomize gender as binary (men vs. women and other)
  gender <- ifelse(club$male == 1, 1, gender)
  gender <- ifelse(club$female == 1, 2, gender)
  gender <- ifelse(club$other == 1, 2, gender)
  gender <- coCovar(gender)
  novice <- coCovar(club$novice)
  
  
  # D: composition change (joiners)
  joiners <- sienaCompositionChange(club$composition)
  
  # now combine the dependent and independent variables in a data object
  mydata <- sienaDataCreate(kudonet, freq_run, time_other, freq_other, gender, winter, joiners, novice)
  mydata2 <- sienaDataCreate(kudonet, time_run, time_other, freq_other, gender, winter, joiners, novice)

  # this finishes the data specification
  clubdata_rsiena_freq[[i]] <- mydata
  clubdata_rsiena_vol[[i]] <- mydata2
} 


#clean environment
rm(list = setdiff(ls(), c("clubdata_rsiena_freq", "clubdata_rsiena_vol")))

# save the output
save(clubdata_rsiena_freq, file = "clubs/clubdata_rsiena_freq.RData")
save(clubdata_rsiena_vol, file = "clubs/clubdata_rsiena_vol.RData")

---
title: "Data preparation"
bibliography: references.bib
date: "Last compiled on `r format(Sys.time(), '%B, %Y')`"
output: 
  html_document:
    css: tweaks.css
    toc:  true
    toc_float: true
    number_sections: false
    toc_depth: 1
    code_folding: show
    code_download: yes
---

```{r, globalsettings, echo=FALSE, warning=FALSE, results='hide'}
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
opts_chunk$set(tidy.opts=list(width.cutoff=100),tidy=TRUE, warning = FALSE, message = FALSE,comment = "#>", cache=TRUE, class.source=c("test"), class.output=c("test2"))
options(width = 100)
rgl::setupKnitr()


colorize <- function(x, color) {sprintf("<span style='color: %s;'>%s</span>", color, x) }
```


```{r klippy, echo=FALSE, include=TRUE}
klippy::klippy(position = c('top', 'right'))
#klippy::klippy(color = 'darkred')
#klippy::klippy(tooltip_message = 'Click to copy', tooltip_success = 'Done')
```


---  

# Getting started


## clean up

```{r, results='hide'}
rm(list=ls())
```

<br>

## general custom functions

- `fpackage.check`: Check if packages are installed (and install if not) in R ([source](https://vbaliga.github.io/verify-that-r-packages-are-installed-and-loaded/))
- `fload.R`: function to load R-objects under new names.
- `fidmaker`: creating mock IDs

```{r, eval=FALSE}

fpackage.check <- function(packages) {
    lapply(packages, FUN = function(x) {
        if (!require(x, character.only = TRUE)) {
            install.packages(x, dependencies = TRUE)
            library(x, character.only = TRUE)
        }
    })
}

fload.R  <- function(fileName){
  load(fileName)
  get(ls()[ls() != "fileName"])
}

fidmaker <- function(x) {
    max.val = x * 1e+05
    count <- nchar(as.character(max.val))  # find out how many 'numbers' each id will have after the letter
    size <- paste("%0", count, "d", sep = "")  # set the variable to be fed into 'sprintf' to ensure we have leading 0's
    lets <- toupper(sample(letters, x, replace = T))  # randomizing the letters 
    nums <- sprintf(size, sample(1:max.val)[1:x])  # randomizing the numbers, and ensuring they all have the same number of characters
    ids <- paste(lets, nums, sep = "")  # joining them together
    return(ids)
  }

```

<br>

## necessary packages

- `RSiena`: creating RSiena data objects
- `dplyr`: package for data wrangling

```{r, eval=FALSE}
packages = c("RSiena", "dplyr")
fpackage.check(packages)
```

<br>
----

# clubdata.RData

In the following scripts a list containing the (anonymized) data of all clubs is made (clubdata.RData).

* Our primary network variable is the Kudos-network. A tie i -> j exists if ego i award at least 1 Kudos to alter j.

* For the behavioral data we include information on the *frequency* (i.e., in times per week) and *volume* (i.e., in hours per week) of running activities. We included activity (frequency and volume) in other sports (e.g., cycling and swimming) as a time-varying covariate.


```{r clubs, eval=FALSE}

# club string represents the club ID
club_str <- c("clubid1", "clubid2", "clubid3", "clubid4" ,"clubid5") 

# the following script reads the data of the clubs from the folder for each club, stores them in a list, and saves it in an object in the last function call of this script. 


for (i in (1: length(club_str))) {
  
  club_id <- club_str[i]
  
  # read the data from the club
  clubdata <- read.csv(paste("clubs/", club_id, "/", "egoData_extended.csv", sep = ""), row.names = NULL,
                       sep = ",")
  # saving club size
  size <- length(clubdata[, "gender"])
  # the number of months that we want to add as waves
  n_waves <- 12
  
  # let's load the friendship/following network
  friend_data <- as.matrix(read.csv(paste("clubs/", club_id, "/", "socialnetwork.csv", sep = ""), row.names = NULL,
                                    sep = ","))
  # remove the first column (represents index made in the csv)
  friend_data <- friend_data[, 2:ncol(friend_data)]
  
  # and anonymize the user id, with our id-maker function
  
  fakeid <- fidmaker(nrow(friend_data))  # generate random id
  colnames(friend_data) <- fakeid  # anonymizing users
  
  # let's load the kudos network
  path <- paste("clubs/", club_id, "/", "kudos", sep = "")  # create path
  {
    kudo_w1 <- as.matrix(read.csv(paste(path, "1-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w1 <- kudo_w1[, 2:ncol(kudo_w1)]
    kudo_w2 <- as.matrix(read.csv(paste(path, "2-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w2 <- kudo_w2[, 2:ncol(kudo_w2)]
    kudo_w3 <- as.matrix(read.csv(paste(path, "3-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w3 <- kudo_w3[, 2:ncol(kudo_w3)]
    kudo_w4 <- as.matrix(read.csv(paste(path, "4-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w4 <- kudo_w4[, 2:ncol(kudo_w4)]
    kudo_w5 <- as.matrix(read.csv(paste(path, "5-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w5 <- kudo_w5[, 2:ncol(kudo_w5)]
    kudo_w6 <- as.matrix(read.csv(paste(path, "6-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w6 <- kudo_w6[, 2:ncol(kudo_w6)]
    kudo_w7 <- as.matrix(read.csv(paste(path, "7-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w7 <- kudo_w7[, 2:ncol(kudo_w7)]
    kudo_w8 <- as.matrix(read.csv(paste(path, "8-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w8 <- kudo_w8[, 2:ncol(kudo_w8)]
    kudo_w9 <- as.matrix(read.csv(paste(path, "9-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w9 <- kudo_w9[, 2:ncol(kudo_w9)]
    kudo_w10 <- as.matrix(read.csv(paste(path, "10-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w10 <- kudo_w10[, 2:ncol(kudo_w10)]
    kudo_w11 <- as.matrix(read.csv(paste(path, "11-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w11 <- kudo_w11[, 2:ncol(kudo_w11)]
    kudo_w12 <- as.matrix(read.csv(paste(path, "12-2019.csv", sep = ""), row.names = NULL, sep = ","))
    kudo_w12 <- kudo_w12[, 2:ncol(kudo_w12)]
  }
  kudos <- array(c(kudo_w1, kudo_w2, kudo_w3, kudo_w4, kudo_w5, kudo_w6, kudo_w7, kudo_w8, kudo_w9,
                   kudo_w10, kudo_w11, kudo_w12), dim = c(size, size, n_waves))  #Kudos matrix
  
  kudo_data <- ifelse(kudos > 0, 1, 0)  #if at least 1 Kudo is send, tie exists
  
  # running time (in hours per week)
  time_run <- array(c(clubdata[, "time_run_1.2019"], clubdata[, "time_run_2.2019"], clubdata[, "time_run_3.2019"],
                      clubdata[, "time_run_4.2019"], clubdata[, "time_run_5.2019"], clubdata[, "time_run_6.2019"],
                      clubdata[, "time_run_7.2019"], clubdata[, "time_run_8.2019"], clubdata[, "time_run_9.2019"],
                      clubdata[, "time_run_10.2019"], clubdata[, "time_run_11.2019"], clubdata[, "time_run_12.2019"]),
                    dim = c(size, 1, n_waves))  # minutes per month
  time_run_h <- time_run/60  # hours per month
  # time_run_30 <- time_run_h * 2 # half hours
  time <- ceiling(time_run_h/4)  # per week
  
  # provide some descriptives 
  # time %>% # absolute table() time %>% # proportionate table() %>%
  # prop.table() %>% round(3) cumprop <- time %>% # cumulative proportions table() %>%
  # prop.table() %>% round(3) %>% cumsum() print(cumprop) 1-cumprop # the percentage of all
  # values that is higher than the particular value
  
  # our idea was to cap running time of at 7+ hours per week.  this resulted in rather smooth
  # (right-skewed) distribution of values for most clubs except for club 5, where 7+ hours
  # category was highly populated over time.  We added 2 extra categories (capped of at 9+
  # hours); this resulted in a rather smooth distribution for this club.
  
  if (club_id == "clubid5") {
    time_run_temp <- ifelse(time > 9, 9, time)
  } else {
    time_run_temp <- ifelse(time > 7, 7, time)
  }
  
  # running frequency (times per week)
  freq_run <- array(c(clubdata[, "X.run_1.2019"], clubdata[, "X.run_2.2019"], clubdata[, "X.run_3.2019"],
                      clubdata[, "X.run_4.2019"], clubdata[, "X.run_5.2019"], clubdata[, "X.run_6.2019"], clubdata[,
                                                                                                                   "X.run_7.2019"], clubdata[, "X.run_8.2019"], clubdata[, "X.run_9.2019"], clubdata[, "X.run_10.2019"],
                      clubdata[, "X.run_11.2019"], clubdata[, "X.run_12.2019"]), dim = c(size, 1, n_waves))  # frequencies per month
  frequencies <- ceiling(freq_run/4)  # per week
  freq_run_temp <- ifelse(frequencies > 7, 7, frequencies)  # cap off at 7 times per week
  

  # we deal with composition change: actors joining the Strava club over time (we assume no leavers);
  # node set was determined at t12; from there, we 'scraped backward' in time;
  # down below, we count the number of waves, from t1 onward, that actors scored 0 on all DVs 
  # that is, where the sum of number of kudos in- and out-ties (network activity), frequency and volume (behavior activity), equals 0.
  # this number +1 is the wave in which actors are assumed to have joined 
  # our model will then assume that entries of joiners happen at the midpoint of the unobserved time period between this and the subsequent wave.
  
  sum_dv <- matrix(NA, nr=size, nc=n_waves)
  
  for (i in 1:size) {
    for (j in 1:n_waves) {
      sum_dv[i,j] <- sum( c( # sum of
        kudo_data[i,,j],     # number of out-ties of actor i at time j
        kudo_data[,i,j],     # number of in-ties of actor i at time j
        freq_run_temp[i,,j], # running frequency of i at j
        time_run_temp[i,,j]  # running volume of i at j
      ))
    }
  }

  t <- rep(1, size) # starting entry t=1
  for ( i in 1:size) {
    for( j in 1:n_waves) {
      
      # if sum = 0 for actor i in wave j, 
      # increment the entry period t with 1
      if(sum_dv[i,j] == 0) {
        t[i] = t[i] + 1
      }
      # if sum > 0, break the loop
      if(sum_dv[i,j] > 0) {
        break
      }
    }
  }
  # entry-waves cannot surpass 12:
  t[which(t>12)] <- 12
  
  # actors that join at t=12, cannot be included into the estimation;
  # they are structural zero;
  # we exclude the rows (i) and columns (j) corresponding to these actors
  # from the kudos matrices
  # we also remove these actors' elements from the running variable objects.
  
  # first, identify these actors (get their index)
  x <- which(t==12)
  
  # make new arrays to store entries of actors, excluding structural zeros
  kudo_data.nm <- array(NA, dim = c(size - length(x),
                                    size - length(x),
                                    n_waves))
  freq_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
  time_run_temp.nm <- array(NA, dim = c(size - length(x), 1, n_waves))
  
  # fill arrays  
  for (i in 1:n_waves) {
    kudo_data.nm[,,i] <- subset(kudo_data[,,i], select = -x)[-x, ]
    freq_run_temp.nm[,,i] <- freq_run_temp[,,i][-x]
    time_run_temp.nm[,,i] <- time_run_temp[,,i][-x]
  }
  
  # and adjust network size accordingly
  size.nm <- size - length(x)


  # create a list for the times of composition change
  # see 4.3.3 of RSIENA manual
  comp <- rep(list(NA), size.nm)
  for (i in 1:size.nm) {
    comp[[i]] <- c(t[-x][i],n_waves)
  }
  
  # RSiena GOF functions do not combine properly with the method of joiners and leavers
  # But if we use NA codes for the tie variables of absent actors, sienaGOF will work properly.

  # get time of joining for each actor:
  t_j <- t[which(t!=12)] 
  
  # for actors that are partially absent, set entries in the kudos-matrix to NA, 
  # for those waves they are absent, i.e., waves 1 : (t_j-1)
  for ( i in which(t_j>1)) {     # for actors that join later
    for (j in 1:n_waves) {       # for all waves
      if(t_j[i]>j) {             # if their time of joining comes after the current wave
        kudo_data.nm[i,,j] <- NA # set their out-degree entries to NA
        kudo_data.nm[,i,j] <- NA # but also their in-degree entries
        freq_run_temp.nm[i,1,j] <- NA # and their behaviors
        time_run_temp.nm[i,1,j] <- NA
      }g
    }
  }

  #  let's load other activity data (e.g., cycling, swimming) time
  time_ride <- array(c(clubdata[, "time_ride_1.2019"], clubdata[, "time_ride_2.2019"], clubdata[, "time_ride_3.2019"],
                       clubdata[, "time_ride_4.2019"], clubdata[, "time_ride_5.2019"], clubdata[, "time_ride_6.2019"],
                       clubdata[, "time_ride_7.2019"], clubdata[, "time_ride_8.2019"], clubdata[, "time_ride_9.2019"],
                       clubdata[, "time_ride_10.2019"], clubdata[, "time_ride_11.2019"], clubdata[, "time_ride_12.2019"]),
                     dim = c(size, 1, n_waves))
  time_other <- array(c(clubdata[, "time_other_1.2019"], clubdata[, "time_other_2.2019"], clubdata[,
                                                                                                   "time_other_3.2019"], clubdata[, "time_other_4.2019"], clubdata[, "time_other_5.2019"], clubdata[,
                                                                                                                                                                                                    "time_other_6.2019"], clubdata[, "time_other_7.2019"], clubdata[, "time_other_8.2019"], clubdata[,
                                                                                                                                                                                                                                                                                                     "time_other_12.2019"]), dim = c(size, 1, n_waves))

  time_other <- time_ride + time_other  # minutes per month
  time_other_h <- time_other/60  # hours per month
  # time_other_h <- time_other_h * 2 # half hours
  time <- ceiling(time_other_h/4)  #per week
  time_other_temp <- ifelse(time > 7, 7, time)  # cap off at 7 hours per week
  #table(time_other_temp)
  
  # frequency
  freq_ride <- array(c(clubdata[, "X.ride_1.2019"], clubdata[, "X.ride_2.2019"], clubdata[, "X.ride_3.2019"],
                       clubdata[, "X.ride_4.2019"], clubdata[, "X.ride_5.2019"], clubdata[, "X.ride_6.2019"], clubdata[,
                                                                                                                       "X.ride_7.2019"], clubdata[, "X.ride_8.2019"], clubdata[, "X.ride_9.2019"], clubdata[, "X.ride_10.2019"],
                       clubdata[, "X.ride_11.2019"], clubdata[, "X.ride_12.2019"]), dim = c(size, 1, n_waves))
  freq_other <- array(c(clubdata[, "X.other_1.2019"], clubdata[, "X.other_2.2019"], clubdata[, "X.other_3.2019"],
                        clubdata[, "X.other_4.2019"], clubdata[, "X.other_5.2019"], clubdata[, "X.other_6.2019"], clubdata[,
                                                                                                                           "X.other_7.2019"], clubdata[, "X.other_8.2019"], clubdata[, "X.other_9.2019"], clubdata[,
                                                                                                                                                                                                                   "X.other_10.2019"], clubdata[, "X.other_11.2019"], clubdata[, "X.other_12.2019"]), dim = c(size,
                                                                                                                                                                                                                                                                                                              1, n_waves))
  
  freq_other <- freq_ride + freq_other
  frequencies <- ceiling(freq_other/4)
  freq_other_temp <- ifelse(frequencies > 7, 7, frequencies)
  
  # and again, exclude the structural zeros:
  freq_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
  time_other_temp.nm <- array(NA, dim = c(size.nm, 1, n_waves))
  for (i in 1:n_waves) {
    freq_other_temp.nm[,,i] <- freq_other_temp[,,i][-x]
    time_other_temp.nm[,,i] <- time_other_temp[,,i][-x]
  }
  
  # separating male/female/other
  
  male <- ifelse(clubdata[, "gender"][-x] == "M", 1,0)
  female <- ifelse(clubdata[, "gender"][-x] == "F", 1, 0)
  other <- ifelse(clubdata[, "gender"][-x] == "O", 1, 0)
  
  # specify months of winter in case we want to use it as a varying covariate starts with
  # december
  winter <- rep(c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), size.nm)
  winter <- matrix(winter, nr = size.nm, nc = n_waves, byrow = TRUE)
  
  # in case we want to condition on the first activity recorded by users:
  st_yr <- clubdata[,'start_date_year']
  st_yr <- st_yr[-x] #exclude NAs
  
  start_year <- ifelse(st_yr <= 2011, 1, 
                       ifelse(st_yr == 2012, 2,
                              ifelse(st_yr == 2013, 3,
                                     ifelse(st_yr == 2014, 4,
                                            ifelse(st_yr == 2015, 5,
                                                   ifelse(st_yr == 2016, 6,
                                                          ifelse(st_yr == 2017, 7,
                                                                 ifelse(st_yr == 2018, 8,
                                                                        9))))))))
  #hist(start_year)
  
  # create a list containing all the read club data for the current club
  club <- list(friendship = friend_data, kudo = kudo_data.nm, freq_run = freq_run_temp.nm, time_run = time_run_temp.nm,
               freq_other = freq_other_temp.nm, time_other = time_other_temp.nm, winter = winter, male = male, female = female,
               other = other, netsize = size.nm, composition = comp, novice=start_year)
  
  
  # save the club object
  save(club, file = paste("clubs/", "club", club_id, ".RData", sep = ""))
  
}


####################################################

# Now that we have saved the clubdata for all clubs, let's combine them in one list

# first clean the working directory, except our club string; we still need that one.
# and our function fload.R
rm(list = setdiff(ls(), c("club_str", "fload.R")))


# load in the separate club-objects
club1 <- fload.R(paste("clubs", "/", "club", club_str[1], ".RData", sep=""))
club2 <- fload.R(paste("clubs", "/", "club", club_str[2], ".RData", sep=""))
club3 <- fload.R(paste("clubs", "/", "club", club_str[3], ".RData", sep=""))
club4 <- fload.R(paste("clubs", "/", "club", club_str[4], ".RData", sep=""))
club5 <- fload.R(paste("clubs", "/", "club", club_str[5], ".RData", sep=""))

# and make a list containing all the clubdata
clubdata <- list(club1, club2, club3, club4, club5)

# save the output
save(clubdata, file = "clubs/clubdata.RData")


``` 

---  

# clubdata_rsiena.RData

The following script creates a list containing RSiena objects for all clubs. 
We create a list containing RSiena objects with running frequency as the behavior variable (clubdata_rsiena_freq.RData) and running volume (clubdata_rsiena_vol.RData). 

Note: in some groups, dependent variables (either the kudos-network or running behaviors) only have upward or downward changes in particular periods. We lift RSiena's automatic restriction to  follow this pattern in the simulations, by using '*allowOnly* = FALSE'. This must be done for the subsequent meta-analysis.

```{r eval=FALSE}
# clean the working environment 
rm (list = ls( ))

# load the clubdata
load("clubs/clubdata.RData")
str(clubdata) # inspect structure
# clubdata is a list of 5 lists, 
# with each of these lists containing data of the corresponding club.

####################################################


clubdata_rsiena_freq <- list()
clubdata_rsiena_vol <- list()


for (i in 1:5) { 
  club <- clubdata[[i]]
  # specify the roles of variables
  names(club)
  
  # A: network variables
  kudonet <- sienaDependent(club$kudo, allowOnly = FALSE)  #at least one Kudo
  
  # B: behavioral variables
  time_run <- sienaDependent(club$time_run, type= "behavior", allowOnly = FALSE)
  freq_run <- sienaDependent(club$freq_run, type= "behavior", allowOnly = FALSE)
  
  time_other <- varCovar(club$time_other[,,])
  freq_other <- varCovar(club$freq_other[,,])

  # C: covariates
  winter <- varCovar(club$winter)
  gender <- NA #we dichotomize gender as binary (men vs. women and other)
  gender <- ifelse(club$male == 1, 1, gender)
  gender <- ifelse(club$female == 1, 2, gender)
  gender <- ifelse(club$other == 1, 2, gender)
  gender <- coCovar(gender)
  novice <- coCovar(club$novice)
  
  
  # D: composition change (joiners)
  joiners <- sienaCompositionChange(club$composition)
  
  # now combine the dependent and independent variables in a data object
  mydata <- sienaDataCreate(kudonet, freq_run, time_other, freq_other, gender, winter, joiners, novice)
  mydata2 <- sienaDataCreate(kudonet, time_run, time_other, freq_other, gender, winter, joiners, novice)

  # this finishes the data specification
  clubdata_rsiena_freq[[i]] <- mydata
  clubdata_rsiena_vol[[i]] <- mydata2
} 


#clean environment
rm(list = setdiff(ls(), c("clubdata_rsiena_freq", "clubdata_rsiena_vol")))

# save the output
save(clubdata_rsiena_freq, file = "clubs/clubdata_rsiena_freq.RData")
save(clubdata_rsiena_vol, file = "clubs/clubdata_rsiena_vol.RData")
```



---




Copyright © 2021 Rob Franken