# This code implements Algorithm 1,
# "Label propagation for NPI",
# from Mizuno, Doi, and Kurizaki (2020, p. 7)
# (https://doi.org/10.1371/journal.pone.0237862)

library(data.table)

# Function to compute the Network or Direct power index
# INPUTS: a data.table with participant-entity relations
# in long form, or an igraph object with edge weights
# OUTPUTS: a data.table with the power index for each participant,
# or a named list with such a table and the labels from each
# iteration (if save_labels = TRUE).
compute_power_index <- function(relations_object,
                                participant_var = "participant",
                                entity_var = "entity",
                                weight_var = "weight",
                                powerindex = "dpi",          # which power index to compute
                                iterations = 100,            # how many iterations to run
                                quota = 50,                  # majority rule, %
                                epsilon = 0.01,              # probability to restore the original label when computing the NPI
                                iterations_to_discard = 20,  # how many iterations to discard when computing the NPI
                                save_labels = FALSE          # whether to save labels from each iteration
                                ) {

  # Debug: powerindex <- "dpi"; iterations <- 100; quota <- 50; epsilon <- 0.01; iterations_to_discard <- 20; save_labels <- F; participant_var = "participant"; entity_var = "entity"; weight_var = "voting_share"

  # Input checks
  ## Is the relations object specified and in the correct form?
  stopifnot("relations_object is not specified" = !missing(relations_object))

  ## If the relations object is an igraph object, transform it to a data.frame
  if( "igraph" %in% class(relations_object) ) {

    relations_object <- igraph::as_data_frame(relations_object, what = "edges")

    # Check whether a weight column is present in the data
    # (i.e. the data.frame has at least 3 columns)
    stopifnot("`relations_object` graph does not have edge weights" = ncol(relations_object) >= 3)

    # Proper names
    names(relations_object)[1:2] <- c(participant_var, entity_var)

  }

  ## If the relations object is not a data.table, try to coerce it to one
  if( !("data.table" %in% class(relations_object)) ) {
    data.table::setDT(relations_object)
  }

  ## Are the participant, entity, and weight variables in the data?
  stopifnot("`participant_var` variable not found in `relations_object`" = participant_var %in% names(relations_object))
  stopifnot("`entity_var` variable not found in `relations_object`" = entity_var %in% names(relations_object))
  stopifnot("`weight_var` variable not found in `relations_object`" = weight_var %in% names(relations_object))

  ## Are the power index options correctly specified?
  stopifnot("`powerindex` should be either dpi or npi" = powerindex %in% c("npi", "dpi"))
  stopifnot("`save_labels` should be either TRUE or FALSE" = save_labels %in% c(TRUE, FALSE))
  ## Are the parameters of the required type?
  stopifnot("`iterations` should be an integer" = isTRUE(all.equal(iterations, as.integer(iterations))) )
  stopifnot("`iterations_to_discard` should be an integer not exceeding `iterations`" = (isTRUE(all.equal(iterations_to_discard, as.integer(iterations_to_discard))) & iterations_to_discard < iterations) | powerindex == "dpi" )

  min_weight <- min(relations_object[[weight_var]], na.rm = T)
  max_weight <- max(relations_object[[weight_var]], na.rm = T)
  stopifnot("`weight_var` should take non-negative numeric values" = min_weight >= 0 & is.finite(max_weight) & is.finite(min_weight) )

  stopifnot("`epsilon` should be in the [0,1] range" = (epsilon >= 0 & epsilon <= 1) | powerindex == "dpi" )

  ## Disable the `iterations_to_discard` option
  ## when computing the DPI
  if( powerindex == "dpi" ) {
    iterations_to_discard <- 0
  }

  # Remove observations with missing data
  ## Participant
  if( nrow(relations_object[ is.na(get(participant_var)) ]) > 0 ) {
    warning(paste0("Removing rows with missing ", participant_var, " before computation"))
    relations_object <- relations_object[ !is.na(get(participant_var)) ]
  }
  ## Entity
  if( nrow(relations_object[ is.na(get(entity_var)) ]) > 0 ) {
    warning(paste0("Removing rows with missing ", entity_var, " before computation"))
    relations_object <- relations_object[ !is.na(get(entity_var)) ]
  }
  ## Weight
  if( nrow(relations_object[ is.na(get(weight_var)) ]) > 0 ) {
    warning(paste0("Removing rows with missing ", weight_var, " before computation"))
    relations_object <- relations_object[ !is.na(get(weight_var)) ]
  }

  # Init an object to update labels in each iteration
  updating_relations <- copy(relations_object[, c(participant_var, entity_var, weight_var), with = F])
  setnames(updating_relations, c("participant", "entity", "weight"))
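  # Worked micro-example of the pivot rule applied inside the loop
  # below (illustrative numbers, not from the paper): suppose an
  # entity has shares A = 40, B = 35, C = 25 and the random order
  # drawn is (B, C, A). The cumulative weights are then 35, 60, 100,
  # so with quota = 50 the pivot is C, the first participant whose
  # cumulative weight reaches the quota.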
  # Init an object to store per-iteration
  # entity labels (i.e. who the pivotal
  # shareholder is)
  if( save_labels == T ) {

    iteration_labels <- data.table()

  # Otherwise we store overall counts of each
  # label being named a pivot
  } else {

    pivot_counts <- updating_relations[, c("participant", "entity")]
    pivot_counts[, times_pivotal := 0 ]

  }

  # Create a progress bar to report on iteration progress
  message(paste0("Iterating through ", iterations, " permutations:"))
  iterations_progress_bar <- txtProgressBar(min = 0, max = iterations, style = 3)

  for(i in 1:iterations) {

    # Randomly order rows within entities
    updating_relations[, random_var := runif(.N)]
    setorderv(updating_relations, c("entity", "random_var"))

    # Breaking cycles when computing the NPI: restore the original
    # labels for a random set of rows with probability epsilon
    if( powerindex == "npi" & epsilon > 0 ) {

      entities_to_restore <- unique(updating_relations[ random_var <= epsilon ]$entity)

      if( length(entities_to_restore) > 0 ) {

        restored_entities <- relations_object[ get(entity_var) %in% entities_to_restore, c(participant_var, entity_var, weight_var), with = F]
        setnames(restored_entities, c("participant", "entity", "weight"))

        # Randomly shuffle the restored labels as well, to respect the random ordering
        restored_entities[, random_var := runif(.N) ]
        setorderv(restored_entities, c("entity", "random_var"))

        updating_relations <- rbind(updating_relations[ !( entity %in% entities_to_restore) ], restored_entities, fill = T)

      }

    }

    # Compute cumulative sums of voting rights by entity
    updating_relations[, cumulative_weight := cumsum(weight), by = "entity"]

    # Lagged cumulative voting share.
    # shift() is not optimized with many groups
    # (https://github.com/Rdatatable/data.table/issues/1534)
    # so we cannot simply use a `by` statement as below:
    # updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1), by = "entity"]
    #
    # Instead, consider a two-step hack that uses our entity ordering
    # to mark as missing the lagged weights for the first participant per entity
    updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1)]
    updating_relations[, entity_l1 := shift(entity, type = "lag", n = 1)]
    updating_relations[entity != entity_l1, cumulative_weight_l1 := NA_real_ ]

    # Identify the pivots: the first participant per entity whose
    # cumulative weight reaches the quota
    pivots <- updating_relations[ cumulative_weight >= quota & (cumulative_weight_l1 < quota | is.na(cumulative_weight_l1) ), c("participant", "entity") ]

    # Store pivots from each iteration in a designated object
    if( save_labels == T ) {

      iteration_labels <- rbind(iteration_labels, data.table(iteration = i, pivots), fill = T)
      # NB: ^ this is the most problematic step in terms of scaling.
      # Where to store and access an N * number-of-iterations table
      # with labels when N is huge and iterations = 20 000? We need
      # to save pivots for ~3 million entities 20 thousand times.
      #
      # Alternatively, you can write pivots from each iteration to a separate CSV,
      # but you will still end up with 20 thousand files, each having ~3 million lines.

    }
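    # A possible middle ground (a sketch, not part of the original
    # algorithm; the file name is hypothetical): append each
    # iteration's pivots to a single on-disk file, which keeps memory
    # usage flat at the cost of extra I/O per iteration.
    # if( save_labels == T ) {
    #   data.table::fwrite(data.table(iteration = i, pivots),
    #                      "iteration_pivots.csv", append = TRUE)
    # }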
    # We can avoid storing the pivots altogether and just
    # add to the total count of times a given participant is
    # named pivotal in a given entity. We also discard
    # the first `iterations_to_discard` iterations
    if( save_labels == F & i > iterations_to_discard ) {

      # We row bind the current pivots to the existing counts
      # and then sum. This might be faster than merging on
      # entity and participant.
      # In fact, it is very slow
      pivot_counts <- rbind(pivot_counts, data.table(pivots, times_pivotal = 1), fill = T)
      pivot_counts <- pivot_counts[, list( times_pivotal = sum(times_pivotal) ), by = c("entity", "participant")]

    }

    # When computing the NPI: update the participant
    # labels with the pivots identified during this
    # iteration step
    if( powerindex == "npi" ) {

      updating_relations[, participant_update := pivots[ match(updating_relations$participant, pivots$entity) ]$participant ]
      updating_relations[ !is.na(participant_update), participant := participant_update ]
      updating_relations[, participant_update := NA ]

      # Sum up the updated shares for the next iteration
      updating_relations <- updating_relations[, list(weight = sum(weight)), by = c("entity", "participant")]

    }

    # Update the progress bar
    setTxtProgressBar(iterations_progress_bar, i)

  }

  # Compute the index itself
  if( save_labels == T ) {

    # Count the number of times each participant
    # was named the pivotal player, discarding
    # the first `iterations_to_discard` iterations
    index_values <- iteration_labels[iteration > iterations_to_discard, list(times_pivotal = .N), by = c("entity", "participant")]

  } else {

    index_values <- pivot_counts

  }

  # Compute the index
  index_values[, index := times_pivotal/(iterations - iterations_to_discard) ]
  setnames(index_values, "index", powerindex)

  if( save_labels == T ) {
    return(list(index_values = index_values, iteration_labels = iteration_labels))
  } else {
    return(index_values)
  }

}
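# ---------------------------------------------------------------
# Usage sketch: a minimal toy ownership network to illustrate the
# interface. The participants, entities, and shares below are
# hypothetical examples, not data from Mizuno, Doi, and Kurizaki
# (2020).
toy_relations <- data.table(
  participant = c("A", "B", "C", "A", "C"),
  entity      = c("firm1", "firm1", "firm1", "firm2", "firm2"),
  weight      = c(40, 35, 25, 60, 40)  # voting shares, % per entity
)

# Direct power index: with quota = 50, each of A, B, and C should be
# pivotal in firm1 in about 1/3 of the random orderings, while A
# should always be pivotal in firm2.
dpi_values <- compute_power_index(toy_relations,
                                  powerindex = "dpi",
                                  iterations = 1000)

# Network power index: participant labels propagate across
# iterations, with a small epsilon chance of restoring the original
# labels to break cycles; early iterations are discarded as burn-in.
npi_values <- compute_power_index(toy_relations,
                                  powerindex = "npi",
                                  iterations = 1000,
                                  epsilon = 0.01,
                                  iterations_to_discard = 200)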