alphaicon/code/helper_functions/compute_power_index.r

# This code implements Algorithm 1
# "Label propagation for NPI"
# from Mizuno, Doi, and Kurizaki (2020, p. 7)
# (https://doi.org/10.1371/journal.pone.0237862)
library(data.table)

# Function to compute Network or Direct power index
# INPUTS: a data.table with participant-entity relations
# in long form or an igraph object with edge weights
# OUTPUTS: data.table with power index for each participant
#	or named list with such table and labels from each iteration
# 	(if save_labels = T).

compute_power_index <- function(relations_object,
								participant_var = "participant",
								entity_var = "entity",
								weight_var = "weight",
								powerindex = "dpi", # which power index to compute
								iterations = 100, # how many iterations to run
								quota = 50, # majority rule, % 
								epsilon = 0.01, # probability to restore the original label when computing the NPI
								iterations_to_discard = 20, # how many iterations to discard when computing the NPI
								save_labels = FALSE # whether to save labels from each iteration
								) {

	# Debug: powerindex <- "dpi"; iterations <- 100; quota <- 50; epsilon <- 0.01; iterations_to_discard <- 20; save_labels <- F; participant_var = "participant"; entity_var = "entity"; weight_var = "voting_share"

	# Input checks
	## Does the relations object exist and is it in correct form?
	stopifnot("relations_object is not specified" = !missing(relations_object))

	## If the relations object is in igraph transform to a data.frame
	if( "igraph" %in% class(relations_object) ) {
		
		relations_object <- igraph::as_data_frame(relations_object, what = "edges")

		# Check whether a weight column is present in the data (i.e. data.frame has at least 3 columns
		stopifnot("`relations_object` graph does not have edge weights" = ncol(relations_object) >= 3)

		# Proper names
		names(relations_object)[1:2] <- c(participant_var, entity_var)

	}

	## If the relations object is not a data.table try to coerce to it
	if( !("data.table" %in% class(relations_object)) ) {

		data.table::setDT(relations_object)

	}

	## Are the participant and entity variables in the data?
	stopifnot("`participant_var` variable not found in `relations_object`" = participant_var %in% names(relations_object))
	stopifnot("`entity_var` variable not found in `relations_object`" = entity_var %in% names(relations_object))
	stopifnot("`weight_var` variable not found in `relations_object`" = weight_var %in% names(relations_object))

	## Are the power index options correctly specified?
	stopifnot("`powerindex` should be either dpi or npi" = powerindex %in% c("npi", "dpi"))
	stopifnot("`save_labels` should be either TRUE or FALSE" = save_labels %in% c(TRUE, FALSE))

	## Are the parameters of the required type?
	stopifnot("`iterations should` be an integer" = all.equal(iterations, as.integer(iterations)) )
	stopifnot("`iterations_to_discard` should be an integer not exceeding `iterations`" = (all.equal(iterations_to_discard, as.integer(iterations_to_discard)) & iterations_to_discard < iterations) | powerindex == "dpi" )
	min_weight <- min(relations_object[[weight_var]], na.rm = T)
	max_weight <- max(relations_object[[weight_var]], na.rm = T)
	stopifnot("`weight_var` should take non-negative numeric values" = min_weight >= 0 & is.finite(max_weight) & is.finite(min_weight) )
	stopifnot("epsilon should be in [0,1] range" = (epsilon >= 0 & epsilon <= 1) | powerindex == "dpi" )

	## Disable the `iterations_to_discard` option
	## when computing DPI
	if( powerindex == "dpi" ) {

		iterations_to_discard <- 0
		
	}

	# Remove observations with missing data
	## Participant
	if( nrow(relations_object[ is.na(get(participant_var)) ]) > 0 ) {
		
		warning(paste0("Removing rows with missing ", participant_var, " before computation"))
		relations_object <- relations_object[ !is.na(get(participant_var)) ]

	}
	## Entity
	if( nrow(relations_object[ is.na(get(entity_var)) ]) > 0 ) {
		
		warning(paste0("Removing rows with missing ", entity_var, " before computation"))
		relations_object <- relations_object[ !is.na(get(entity_var)) ]

	}
	## Weight
	if( nrow(relations_object[ is.na(get(weight_var)) ]) > 0 ) {
		
		warning(paste0("Removing rows with missing ", weight_var, " before computation"))
		relations_object <- relations_object[ !is.na(get(weight_var)) ]

	}

	# Init an object to update labels in each iteration
	updating_relations <- copy(relations_object[, c(participant_var, entity_var, weight_var), with = F])
	setnames(updating_relations, c("participant", "entity", "weight"))

	# Init an object to store per-iteration
	# entity labels (i.e. who the pivotal
	# shareholder is)
	if( save_labels == T) {
	
		iteration_labels <- data.table()
	
	# Otherwise we store overall counts of each
	# label being named a pivot
	} else {
	
		pivot_counts <- updating_relations[, c("participant", "entity")]
		pivot_counts[, times_pivotal := 0 ]

	}

	# Create a progress bar to report on iteration progess
	message(paste0("Iterating through ", iterations, " permutations:"))
	iterations_progess_bar <- txtProgressBar(min = 0, max = iterations, style = 3)

	for(i in 1:iterations) {
	
		# Randomly order rows within entities
		updating_relations[, random_var := runif(.N)]
		setorderv(updating_relations, c("entity", "random_var"))
	
		# Breaking cycles when computing NPI: restore original
		# labels for the random set of rows with epsilon
		# probability
		if( powerindex == "npi" & epsilon > 0 ) {

			entities_to_restore <- unique(updating_relations[ random_var <= epsilon ]$entity)
	
			if( length(entities_to_restore) > 0 ) {

				restored_entities <- relations_object[ get(entity_var) %in% entities_to_restore, c(participant_var, entity_var, weight_var), with = F]
				setnames(restored_entities, c("participant", "entity", "weight"))

				# Randomly shuffle the labels to restore as well respect random ordering
				restored_entities[, random_var := runif(.N) ]
				setorderv(restored_entities, c("entity", "random_var"))

				updating_relations <- rbind(updating_relations[ !( entity %in% entities_to_restore) ], 
								restored_entities,
								fill = T)

			}

		}
		# Compute cumulative sums of voting rights by entity
		updating_relations[, cumulative_weight := cumsum(weight), by = "entity"]
	
		# Lagged cumulative voting share
		# shift() is not optimized with many groups
		# (https://github.com/Rdatatable/data.table/issues/1534)
		# so we cannot simply use `by` statement as below 
		# updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1), by = "entity"]
		#
		# Instead, consider a two-step hack that uses our entity ordering
		# to mark as missing lagged weights for the first participant per entity
		updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1)]
		updating_relations[, entity_l1 := shift(entity, type = "lag", n = 1)]
		updating_relations[entity != entity_l1, cumulative_weight_l1 := NA_real_ ]

		# Identify pivots
		pivots <- updating_relations[ cumulative_weight >= quota & (cumulative_weight_l1 < quota | is.na(cumulative_weight_l1) ), c("participant", "entity") ]
	
		# Store pivots from each iteration in a designated object
		if( save_labels == T) {
	
			iteration_labels <- rbind(iteration_labels, data.table(iteration = i, pivots), fill = T)
	
			# NB: ^ this is the most problematic step in terms of scaling.
			# Where to store and access an N * number of iterations table
			# with labels when N is huge and iterations = 20 000? We need
			# to save pivots for ~3 million entities 20 thousand times.
			#
			# Alternatively you can write pivots from each iteration to a separate CSV
			# but you will still end up with 20 thousand files, each having ~3 million lines.
		
		}
	
		# We can avoid storing the pivots altogether and just
		# add to the total count of times a given partipant is
		# named pivotal in a given entity. We also discard
		# the first `iterations_to_discard` iterations
		if( save_labels == F & i > iterations_to_discard ) {
	
			# We row bind the current pivots to the existing counts
			# and then sum. This might be faster than merging on
			# entity and participant.
			# In fact, it is very slow
			pivot_counts <- rbind(pivot_counts, data.table(pivots, times_pivotal = 1), fill = T)
			pivot_counts <- pivot_counts[, list( times_pivotal = sum(times_pivotal) ), by = c("entity", "participant")]
	
		}
	
		# When computing NPI: update the participant
		# labels with pivots identified during this
		# iteration step
		if( powerindex == "npi" ) {

			updating_relations[, participant_update := pivots[ match(updating_relations$participant, pivots$entity) ]$participant ]
			updating_relations[ !is.na(participant_update), participant := participant_update ]
			updating_relations[, participant_update := NA ]

			# Sum up the updated shares for the next iteration
			updating_relations <- updating_relations[, list(weight = sum(weight)), by = c("entity", "participant")]
	
		}

		# Update the progress bar
		setTxtProgressBar(iterations_progess_bar, i)
	
	}

	# Compute the index itself
	if( save_labels == T ) {
	
		# Count the number of times this
		# participant was named the pivotal player,
		# discarding first `iterations_to_discard` iterations
		index_values <- iteration_labels[iteration > iterations_to_discard, list(times_pivotal = .N), by = c("entity", "participant")]
	
	} else {
	
		index_values <- pivot_counts
	
	}

	# Compute the index
	index_values[, index := times_pivotal/(iterations - iterations_to_discard) ]
	setnames(index_values, "index", powerindex)

	if( save_labels == T ) {

		return(list(index_values = index_values, iteration_labels = iteration_labels))

	} else {

		return(index_values)

	}

}
Initial commit 2021-09-16 08:06:49 +01:00			`# This code implements Algorithm 1`
			`# "Label propagation for NPI"`
			`# from Mizuno, Doi, and Kurizaki (2020, p. 7)`
			`# (https://doi.org/10.1371/journal.pone.0237862)`
			`library(data.table)`

			`# Function to compute Network or Direct power index`
			`# INPUTS: a data.table with participant-entity relations`
			`# in long form or an igraph object with edge weights`
			`# OUTPUTS: data.table with power index for each participant`
			`# or named list with such table and labels from each iteration`
			`# (if save_labels = T).`

			`compute_power_index <- function(relations_object,`
			`participant_var = "participant",`
			`entity_var = "entity",`
			`weight_var = "weight",`
			`powerindex = "dpi", # which power index to compute`
			`iterations = 100, # how many iterations to run`
			`quota = 50, # majority rule, %`
			`epsilon = 0.01, # probability to restore the original label when computing the NPI`
			`iterations_to_discard = 20, # how many iterations to discard when computing the NPI`
			`save_labels = FALSE # whether to save labels from each iteration`
			`) {`

			`# Debug: powerindex <- "dpi"; iterations <- 100; quota <- 50; epsilon <- 0.01; iterations_to_discard <- 20; save_labels <- F; participant_var = "participant"; entity_var = "entity"; weight_var = "voting_share"`

			`# Input checks`
			`## Does the relations object exist and is it in correct form?`
			`stopifnot("relations_object is not specified" = !missing(relations_object))`

			`## If the relations object is in igraph transform to a data.frame`
			`if( "igraph" %in% class(relations_object) ) {`

			`relations_object <- igraph::as_data_frame(relations_object, what = "edges")`

			`# Check whether a weight column is present in the data (i.e. data.frame has at least 3 columns`
			stopifnot("`relations_object` graph does not have edge weights" = ncol(relations_object) >= 3)

			`# Proper names`
			`names(relations_object)[1:2] <- c(participant_var, entity_var)`

			`}`

			`## If the relations object is not a data.table try to coerce to it`
			`if( !("data.table" %in% class(relations_object)) ) {`

			`data.table::setDT(relations_object)`

			`}`

			`## Are the participant and entity variables in the data?`
			stopifnot("`participant_var` variable not found in `relations_object`" = participant_var %in% names(relations_object))
			stopifnot("`entity_var` variable not found in `relations_object`" = entity_var %in% names(relations_object))
			stopifnot("`weight_var` variable not found in `relations_object`" = weight_var %in% names(relations_object))

			`## Are the power index options correctly specified?`
			stopifnot("`powerindex` should be either dpi or npi" = powerindex %in% c("npi", "dpi"))
			stopifnot("`save_labels` should be either TRUE or FALSE" = save_labels %in% c(TRUE, FALSE))

			`## Are the parameters of the required type?`
			stopifnot("`iterations should` be an integer" = all.equal(iterations, as.integer(iterations)) )
			stopifnot("`iterations_to_discard` should be an integer not exceeding `iterations`" = (all.equal(iterations_to_discard, as.integer(iterations_to_discard)) & iterations_to_discard < iterations) \| powerindex == "dpi" )
			`min_weight <- min(relations_object[[weight_var]], na.rm = T)`
			`max_weight <- max(relations_object[[weight_var]], na.rm = T)`
			stopifnot("`weight_var` should take non-negative numeric values" = min_weight >= 0 & is.finite(max_weight) & is.finite(min_weight) )
			`stopifnot("epsilon should be in [0,1] range" = (epsilon >= 0 & epsilon <= 1) \| powerindex == "dpi" )`

			## Disable the `iterations_to_discard` option
			`## when computing DPI`
			`if( powerindex == "dpi" ) {`

			`iterations_to_discard <- 0`

			`}`

			`# Remove observations with missing data`
			`## Participant`
			`if( nrow(relations_object[ is.na(get(participant_var)) ]) > 0 ) {`

			`warning(paste0("Removing rows with missing ", participant_var, " before computation"))`
			`relations_object <- relations_object[ !is.na(get(participant_var)) ]`

			`}`
			`## Entity`
			`if( nrow(relations_object[ is.na(get(entity_var)) ]) > 0 ) {`

			`warning(paste0("Removing rows with missing ", entity_var, " before computation"))`
			`relations_object <- relations_object[ !is.na(get(entity_var)) ]`

			`}`
			`## Weight`
			`if( nrow(relations_object[ is.na(get(weight_var)) ]) > 0 ) {`

			`warning(paste0("Removing rows with missing ", weight_var, " before computation"))`
			`relations_object <- relations_object[ !is.na(get(weight_var)) ]`

			`}`

			`# Init an object to update labels in each iteration`
			`updating_relations <- copy(relations_object[, c(participant_var, entity_var, weight_var), with = F])`
			`setnames(updating_relations, c("participant", "entity", "weight"))`

			`# Init an object to store per-iteration`
			`# entity labels (i.e. who the pivotal`
			`# shareholder is)`
			`if( save_labels == T) {`

			`iteration_labels <- data.table()`

			`# Otherwise we store overall counts of each`
			`# label being named a pivot`
			`} else {`

			`pivot_counts <- updating_relations[, c("participant", "entity")]`
			`pivot_counts[, times_pivotal := 0 ]`

			`}`

			`# Create a progress bar to report on iteration progess`
			`message(paste0("Iterating through ", iterations, " permutations:"))`
			`iterations_progess_bar <- txtProgressBar(min = 0, max = iterations, style = 3)`

			`for(i in 1:iterations) {`

			`# Randomly order rows within entities`
			`updating_relations[, random_var := runif(.N)]`
			`setorderv(updating_relations, c("entity", "random_var"))`

			`# Breaking cycles when computing NPI: restore original`
			`# labels for the random set of rows with epsilon`
			`# probability`
			`if( powerindex == "npi" & epsilon > 0 ) {`

			`entities_to_restore <- unique(updating_relations[ random_var <= epsilon ]$entity)`

			`if( length(entities_to_restore) > 0 ) {`

			`restored_entities <- relations_object[ get(entity_var) %in% entities_to_restore, c(participant_var, entity_var, weight_var), with = F]`
			`setnames(restored_entities, c("participant", "entity", "weight"))`

			`# Randomly shuffle the labels to restore as well respect random ordering`
			`restored_entities[, random_var := runif(.N) ]`
			`setorderv(restored_entities, c("entity", "random_var"))`

			`updating_relations <- rbind(updating_relations[ !( entity %in% entities_to_restore) ],`
			`restored_entities,`
			`fill = T)`

			`}`

			`}`
			`# Compute cumulative sums of voting rights by entity`
			`updating_relations[, cumulative_weight := cumsum(weight), by = "entity"]`

			`# Lagged cumulative voting share`
			`# shift() is not optimized with many groups`
			`# (https://github.com/Rdatatable/data.table/issues/1534)`
			# so we cannot simply use `by` statement as below
			`# updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1), by = "entity"]`
			`#`
			`# Instead, consider a two-step hack that uses our entity ordering`
			`# to mark as missing lagged weights for the first participant per entity`
			`updating_relations[, cumulative_weight_l1 := shift(cumulative_weight, type = "lag", n = 1)]`
			`updating_relations[, entity_l1 := shift(entity, type = "lag", n = 1)]`
			`updating_relations[entity != entity_l1, cumulative_weight_l1 := NA_real_ ]`

			`# Identify pivots`
			`pivots <- updating_relations[ cumulative_weight >= quota & (cumulative_weight_l1 < quota \| is.na(cumulative_weight_l1) ), c("participant", "entity") ]`

			`# Store pivots from each iteration in a designated object`
			`if( save_labels == T) {`

			`iteration_labels <- rbind(iteration_labels, data.table(iteration = i, pivots), fill = T)`

			`# NB: ^ this is the most problematic step in terms of scaling.`
			`# Where to store and access an N * number of iterations table`
			`# with labels when N is huge and iterations = 20 000? We need`
			`# to save pivots for ~3 million entities 20 thousand times.`
			`#`
			`# Alternatively you can write pivots from each iteration to a separate CSV`
			`# but you will still end up with 20 thousand files, each having ~3 million lines.`

			`}`

			`# We can avoid storing the pivots altogether and just`
			`# add to the total count of times a given partipant is`
			`# named pivotal in a given entity. We also discard`
			# the first `iterations_to_discard` iterations
			`if( save_labels == F & i > iterations_to_discard ) {`

			`# We row bind the current pivots to the existing counts`
			`# and then sum. This might be faster than merging on`
			`# entity and participant.`
			`# In fact, it is very slow`
			`pivot_counts <- rbind(pivot_counts, data.table(pivots, times_pivotal = 1), fill = T)`
			`pivot_counts <- pivot_counts[, list( times_pivotal = sum(times_pivotal) ), by = c("entity", "participant")]`

			`}`

			`# When computing NPI: update the participant`
			`# labels with pivots identified during this`
			`# iteration step`
			`if( powerindex == "npi" ) {`

			`updating_relations[, participant_update := pivots[ match(updating_relations$participant, pivots$entity) ]$participant ]`
			`updating_relations[ !is.na(participant_update), participant := participant_update ]`
			`updating_relations[, participant_update := NA ]`

			`# Sum up the updated shares for the next iteration`
			`updating_relations <- updating_relations[, list(weight = sum(weight)), by = c("entity", "participant")]`

			`}`

			`# Update the progress bar`
			`setTxtProgressBar(iterations_progess_bar, i)`

			`}`

			`# Compute the index itself`
			`if( save_labels == T ) {`

			`# Count the number of times this`
			`# participant was named the pivotal player,`
			# discarding first `iterations_to_discard` iterations
			`index_values <- iteration_labels[iteration > iterations_to_discard, list(times_pivotal = .N), by = c("entity", "participant")]`

			`} else {`

			`index_values <- pivot_counts`

			`}`

			`# Compute the index`
			`index_values[, index := times_pivotal/(iterations - iterations_to_discard) ]`
			`setnames(index_values, "index", powerindex)`

			`if( save_labels == T ) {`

			`return(list(index_values = index_values, iteration_labels = iteration_labels))`

			`} else {`

			`return(index_values)`

			`}`

			`}`