# alphaicon/code/helper_functions/compute_power_index.r
# Author: Dmitriy Skougarevskiy (initial commit 3b419bab6e, 2021-09-16)

# This code implements Algorithm 1
# "Label propagation for NPI"
# from Mizuno, Doi, and Kurizaki (2020, p. 7)
# (https://doi.org/10.1371/journal.pone.0237862)
library(data.table)
# Function to compute Network or Direct power index
# INPUTS: a data.table with participant-entity relations
# in long form or an igraph object with edge weights
# OUTPUTS: data.table with power index for each participant
# or named list with such table and labels from each iteration
# (if save_labels = T).
compute_power_index <- function(relations_object,
                                participant_var = "participant",
                                entity_var = "entity",
                                weight_var = "weight",
                                powerindex = "dpi", # which power index to compute ("dpi" or "npi")
                                iterations = 100, # how many label-propagation iterations to run
                                quota = 50, # majority rule threshold, %
                                epsilon = 0.01, # probability to restore the original label when computing the NPI
                                iterations_to_discard = 20, # burn-in iterations to discard when computing the NPI
                                save_labels = FALSE # whether to save pivot labels from each iteration
) {
  # Input checks
  ## Does the relations object exist?
  stopifnot("relations_object is not specified" = !missing(relations_object))
  ## If the relations object is an igraph graph, transform it to an edge data.frame
  if (inherits(relations_object, "igraph")) {
    relations_object <- igraph::as_data_frame(relations_object, what = "edges")
    ## A weighted edge list has at least 3 columns (from, to, weight)
    stopifnot("`relations_object` graph does not have edge weights" = ncol(relations_object) >= 3)
    ## Rename the endpoint columns to the user-supplied variable names
    names(relations_object)[1:2] <- c(participant_var, entity_var)
  }
  ## Coerce to data.table WITHOUT mutating the caller's object:
  ## setDT() would convert the input in place as a hidden side effect,
  ## so we take a copy via as.data.table() instead
  if (!data.table::is.data.table(relations_object)) {
    relations_object <- data.table::as.data.table(relations_object)
  }
  ## Are the participant, entity and weight variables in the data?
  stopifnot("`participant_var` variable not found in `relations_object`" = participant_var %in% names(relations_object))
  stopifnot("`entity_var` variable not found in `relations_object`" = entity_var %in% names(relations_object))
  stopifnot("`weight_var` variable not found in `relations_object`" = weight_var %in% names(relations_object))
  ## Are the power index options correctly specified?
  stopifnot("`powerindex` should be either dpi or npi" = powerindex %in% c("npi", "dpi"))
  stopifnot("`save_labels` should be either TRUE or FALSE" = isTRUE(save_labels) || isFALSE(save_labels))
  ## Are the parameters of the required type?
  ## NB: all.equal() returns a character string on mismatch, so it must be
  ## wrapped in isTRUE() before stopifnot() can evaluate it as a condition
  stopifnot("`iterations` should be an integer" = isTRUE(all.equal(iterations, as.integer(iterations))))
  stopifnot("`iterations_to_discard` should be an integer not exceeding `iterations`" = (isTRUE(all.equal(iterations_to_discard, as.integer(iterations_to_discard))) && iterations_to_discard < iterations) || powerindex == "dpi")
  min_weight <- min(relations_object[[weight_var]], na.rm = TRUE)
  max_weight <- max(relations_object[[weight_var]], na.rm = TRUE)
  stopifnot("`weight_var` should take non-negative numeric values" = min_weight >= 0 && is.finite(max_weight) && is.finite(min_weight))
  stopifnot("epsilon should be in [0,1] range" = (epsilon >= 0 && epsilon <= 1) || powerindex == "dpi")
  ## The burn-in option only applies to the NPI
  if (powerindex == "dpi") {
    iterations_to_discard <- 0
  }
  # Remove observations with missing data in any of the three key columns,
  # warning once per affected column
  for (key_var in c(participant_var, entity_var, weight_var)) {
    if (anyNA(relations_object[[key_var]])) {
      warning(paste0("Removing rows with missing ", key_var, " before computation"))
      relations_object <- relations_object[!is.na(get(key_var))]
    }
  }
  # Working copy whose participant labels are updated across iterations
  updating_relations <- copy(relations_object[, c(participant_var, entity_var, weight_var), with = FALSE])
  setnames(updating_relations, c("participant", "entity", "weight"))
  # Storage: either per-iteration pivot labels (save_labels = TRUE), i.e. who
  # the pivotal shareholder is, or running counts of how often each
  # participant is named pivotal in each entity
  if (save_labels) {
    iteration_labels <- data.table()
  } else {
    pivot_counts <- updating_relations[, c("participant", "entity")]
    pivot_counts[, times_pivotal := 0]
  }
  # Progress bar reporting on iteration progress; close it even on error
  # so the connection does not leak
  message(paste0("Iterating through ", iterations, " permutations:"))
  iterations_progress_bar <- txtProgressBar(min = 0, max = iterations, style = 3)
  on.exit(close(iterations_progress_bar), add = TRUE)
  # seq_len() (unlike 1:iterations) iterates zero times when iterations == 0
  for (i in seq_len(iterations)) {
    # Randomly order rows within entities (a random permutation of voters)
    updating_relations[, random_var := runif(.N)]
    setorderv(updating_relations, c("entity", "random_var"))
    # Breaking cycles when computing NPI: restore original labels for a
    # random set of entities with epsilon probability
    if (powerindex == "npi" && epsilon > 0) {
      entities_to_restore <- unique(updating_relations[random_var <= epsilon]$entity)
      if (length(entities_to_restore) > 0) {
        restored_entities <- relations_object[get(entity_var) %in% entities_to_restore, c(participant_var, entity_var, weight_var), with = FALSE]
        setnames(restored_entities, c("participant", "entity", "weight"))
        # Randomly shuffle the restored rows as well to respect random ordering
        restored_entities[, random_var := runif(.N)]
        setorderv(restored_entities, c("entity", "random_var"))
        updating_relations <- rbind(updating_relations[!(entity %in% entities_to_restore)],
                                    restored_entities,
                                    fill = TRUE)
      }
    }
    # Cumulative sums of voting rights by entity in the random order
    updating_relations[, cumulative_weight := cumsum(weight), by = "entity"]
    # Lagged cumulative voting share. shift() with `by` is not optimized for
    # many groups (https://github.com/Rdatatable/data.table/issues/1534), so
    # instead lag over the whole (entity-ordered) table and blank out the
    # lagged weight on the first row of each entity
    updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1)]
    updating_relations[, entity_l1 := shift(entity, type = "lag", n = 1)]
    updating_relations[entity != entity_l1, cumulative_weight_l1 := NA_real_]
    # The pivot is the participant whose share pushes the cumulative sum
    # across the quota (or the sole participant already past it)
    pivots <- updating_relations[cumulative_weight >= quota & (cumulative_weight_l1 < quota | is.na(cumulative_weight_l1)), c("participant", "entity")]
    if (save_labels) {
      # Store pivots from each iteration in a designated object.
      # NB: this is the most problematic step in terms of scaling — with
      # N huge and iterations = 20 000 this table becomes enormous
      iteration_labels <- rbind(iteration_labels, data.table(iteration = i, pivots), fill = TRUE)
    } else if (i > iterations_to_discard) {
      # Avoid storing pivots altogether: add to the running count of times a
      # participant is named pivotal, discarding the burn-in iterations.
      # Row-binding then summing avoids a merge on entity and participant
      pivot_counts <- rbind(pivot_counts, data.table(pivots, times_pivotal = 1), fill = TRUE)
      pivot_counts <- pivot_counts[, list(times_pivotal = sum(times_pivotal)), by = c("entity", "participant")]
    }
    # When computing NPI: propagate labels — a participant that is itself an
    # entity inherits that entity's pivot from this iteration
    if (powerindex == "npi") {
      updating_relations[, participant_update := pivots[match(updating_relations$participant, pivots$entity)]$participant]
      updating_relations[!is.na(participant_update), participant := participant_update]
      updating_relations[, participant_update := NULL]
      # Sum up the updated shares for the next iteration
      updating_relations <- updating_relations[, list(weight = sum(weight)), by = c("entity", "participant")]
    }
    setTxtProgressBar(iterations_progress_bar, i)
  }
  # Count pivotal occurrences per participant-entity pair
  if (save_labels) {
    # Discard the first `iterations_to_discard` iterations before counting
    index_values <- iteration_labels[iteration > iterations_to_discard, list(times_pivotal = .N), by = c("entity", "participant")]
  } else {
    index_values <- pivot_counts
  }
  # The index is the share of retained iterations in which the participant
  # was pivotal
  index_values[, index := times_pivotal / (iterations - iterations_to_discard)]
  setnames(index_values, "index", powerindex)
  if (save_labels) {
    return(list(index_values = index_values, iteration_labels = iteration_labels))
  }
  index_values
}