# alphaicon/code/alphaicon_paper/6_rank_top_holders.r
# This code sums the computed control shares
# of the super-holders in the PSC data and
# produces the rankings under different algorithms
# for the α-ICON paper
library(data.table)
library(lubridate)
library(stringi)
library(stringr)
library(stargazer)
# Declare the working directory beforehand in an environment variable
# ALPHAICON_PATH = "path_to_your_folder"
# with the aid of usethis::edit_r_environ()
# Restart the R session for the change to take effect
setwd(Sys.getenv('ALPHAICON_PATH'))
############
# Data loading
# Load the active participants snapshot of PSC (prepared by data_preparation/uk/2_psc_snapshot_to_participants_panel.r)
psc <- fread("output/uk/uk_organisations_participants_2021_long_2aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
# Load the companies data (prepared by data_preparation/uk/1b_process_companies_data.r)
load("data/uk/uk_basic_companies_data_2021-08-01.rdata")
# Load the DPI of each participant after 10000 iterations (computed by alphaicon_paper/2_compute_npi_dpi.r)
holders_dpi <- fread("output/uk/npi_dpi/10000iter/uk_organisations_participants_2021_long_7sep21_dpi_10000iter.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
setnames(holders_dpi, c("entity", "participant", "dpi"), c("company_number", "participant_id", "share"))
holders_dpi[, times_pivotal := NULL ]
# Load the NPI of each participant after 10000 iterations (computed by alphaicon_paper/2_compute_npi_dpi.r)
holders_npi <- fread("output/uk/npi_dpi/10000iter/uk_organisations_participants_2021_long_7sep21_npi_10000iter.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
setnames(holders_npi, c("entity", "participant", "npi"), c("company_number", "participant_id", "share"))
holders_npi[, times_pivotal := NULL ]
# Load the transitive ownership data at various alpha
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
alphas <- c("0.0", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "0.999")
for( a in alphas) {
  temp <- fread(paste0("output/uk/transitive/uk_organisations_transitive_ownership_alpha", a, "_2021_long_2aug21.csv"), integer64 = "character", na.strings = "", encoding = "UTF-8", colClasses = c("character", "character", "numeric"))
  assign(paste0("holders_transitive_alpha", a), temp)
  rm(temp)
}
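# Optional sanity check (illustrative, not part of the original pipeline):
# confirm that every alpha-specific table was loaded and is non-empty
#sapply(paste0("holders_transitive_alpha", alphas), function(x) nrow(get(x)))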
# Load the data on graph membership
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
graph_membership <- fread("output/uk/uk_organisations_participation_graph_core_periphery_membership_6aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8", header = T, colClasses = c("numeric", "character", "factor"))
graph_membership[, V1 := NULL ]
setnames(graph_membership, "company_number/id", "participant_id")
# All holder objects that are loaded in the memory
holder_objects <- paste0("holders_", c("dpi", "npi", paste0("transitive_alpha", alphas)))
# Add the graph membership and remove self-loops
for(obj in holder_objects) {
  get(obj)[, participant_type := graph_membership[ match(get(obj)$participant_id, graph_membership$participant_id) ]$type ]
  get(obj)[, company_type := graph_membership[ match(get(obj)$company_number, graph_membership$participant_id) ]$type ]
  assign(obj, get(obj)[ company_number != participant_id ])
}
psc[, participant_type := graph_membership[ match(psc$participant_id, graph_membership$participant_id) ]$type ]
psc[, company_type := graph_membership[ match(psc$company_number, graph_membership$participant_id) ]$type ]
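# Optional check (illustrative): distribution of participant types after the merge;
# "SH" marks the super-holders used in the rankings below, NA means no match in the membership file
#holders_dpi[, .N, by = "participant_type"]
#psc[, .N, by = "participant_type"]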
############
# Identify top holders
# Compute total holdings (weighted and unweighted by assets)
holdings_dpi <- holders_dpi[, list( sumdpi = sum(share)), by = "participant_id"]
holdings_dpi[, rankdpi := frankv(sumdpi, order = -1, ties.method = "first")]
holdings_npi <- holders_npi[, list( sumnpi = sum(share)), by = "participant_id"]
holdings_npi[, ranknpi := frankv(sumnpi, order = -1, ties.method = "first")]
# Normalize shares by type to one for our transitive data
holders_transitive_alpha0.999[, share_typenormalized := share/sum(share), by = c("company_number", "participant_type")]
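# Optional check (illustrative): within every (company, participant type) group
# the type-normalized shares should sum to one, up to floating-point error
#holders_transitive_alpha0.999[, .(total = sum(share_typenormalized)), by = c("company_number", "participant_type")][abs(total - 1) > 1e-8]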
# Keep only super-holders and sum up their shares
holdings_transitive <- holders_transitive_alpha0.999[participant_type == "SH", list( sumtransitive = sum(share_typenormalized)), by = "participant_id"]
# Rank the holders by their share
holdings_transitive[, ranktransitive := frankv(sumtransitive, order = -1, ties.method = "first")]
# Top-100 participants by our metric compared to other metrics
top_holders <- setorderv(holdings_transitive, "ranktransitive", 1)[1:100]
# Add ranks from other algorithms
top_holders <- merge(top_holders, holdings_dpi, by = "participant_id", all.x = T, all.y = F, sort = F)
top_holders <- merge(top_holders, holdings_npi, by = "participant_id", all.x = T, all.y = F, sort = F)
# Proper names
## First, match on company number with companies data
top_holders[, participant_name := uk_basic_companies_data[ match(top_holders$participant_id, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
## Second, for participants whose ids contain two "$" separators, take the company name from the first component
top_holders[ str_count(participant_id, fixed("$")) == 2, participant_name := stri_split_fixed(participant_id, "$", n = 3, simplify = T)[,1] ]
## Third, for person ids with four "$" separators, combine the first-name and surname components
top_holders[ str_count(participant_id, fixed("$")) == 4, participant_name := paste0(stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 1], " ", stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 3]) ]
## The participant id is kept since it is needed for the merges and network characteristics below
#top_holders[, participant_id := NULL ]
# Proper column order
setcolorder(top_holders, c("participant_name", "participant_id", "ranktransitive", "ranknpi", "rankdpi", "sumtransitive", "sumnpi", "sumdpi"))
# Round the score
top_holders[, c("sumtransitive", "sumnpi", "sumdpi") := lapply(.SD, round, 1), .SDcols = c("sumtransitive", "sumnpi", "sumdpi") ]
# Add the size of each super-holder's network of holdings and the distribution of node types (core-periphery membership)
top_holders_network_chars <- data.table()
for(superholder_id in top_holders$participant_id) {
# Debug: superholder_id <- "SC095000"
# Ultimate companies held by this organization
holders_subset_ultimate <- holders_transitive_alpha0.999[ participant_id %in% superholder_id ]
# Consider their total interest in all entities
holders_subset <- psc[ participant_id %in% unique(holders_subset_ultimate$company_number) | company_number %in% unique(holders_subset_ultimate$company_number) ]
holders_subset <- holders_subset[ !is.na(equity_share)]
holders_subset <- holders_subset[ participant_id != company_number]
# Share to 0-1 range
setnames(holders_subset, "equity_share", "share")
holders_subset[, share := share/100]
## Create an object with vertex metadata
vertex_metadata <- unique(rbind(unique(data.table( name = holders_subset$company_number, type = holders_subset$company_type)),
                                data.table( name = holders_subset$participant_id, type = holders_subset$participant_type)
                                ), by = "name")
# Count all vertex types
out <- vertex_metadata[, .N, by = "type"]
# Transpose
out <- dcast(melt(out, id.vars = "type"), variable ~ type)
out[, variable := NULL ]
# Keep only the columns of interest in the order of interest
types_of_interest <- c("SH", "ST", "I", "C")
names_outside_types_of_interest <- types_of_interest[!(types_of_interest %in% names(out))]
if( length(names_outside_types_of_interest) > 0) {
out[, c(names_outside_types_of_interest) := 0]
}
out <- out[, types_of_interest, with = F]
# Add the network size
out <- data.table(participant_id = superholder_id, nodes = nrow(vertex_metadata), out)
top_holders_network_chars <- rbind(top_holders_network_chars, out, fill = T)
message(superholder_id)
}
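# Optional check (illustrative): for each super-holder the four type counts should account
# for (nearly) all nodes; rows flagged here contain vertices with other or missing type labels
#top_holders_network_chars[ nodes != SH + ST + I + C ]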
# Add to the top holders data
top_holders <- merge(top_holders, top_holders_network_chars, by = "participant_id", sort = F)
# Write to CSV
fwrite(top_holders, file = "output/alphaicon_paper/uk_organisations_top100_holders_2021_long_2aug21.csv")
# Produce a TeX table
#stargazer(top_holders[1:25, -"participant_id"], type = "latex", summary = F, out = "output/alphaicon_paper/uk_organisations_top25_holders_2021_long_2aug21.tex", float = F, multicolumn = T, digits = 1)
# Export the information on top-25 entities with the largest deviation between ranks
combined_holdings <- merge(holdings_dpi, holdings_npi, by = "participant_id", all = T)
combined_holdings <- merge(combined_holdings, holdings_transitive, by = "participant_id", all = T)
combined_holdings[, diff_npi_dpi := sumnpi - sumdpi ]
combined_holdings[, diff_transitive_dpi := sumtransitive - sumdpi ]
combined_holdings[, diff_transitive_npi := sumtransitive - sumnpi ]
top100_holders_diff_npi_dpi <- setorderv(combined_holdings, "diff_npi_dpi", -1, na.last = T)[1:100]
top100_holders_diff_transitive_dpi <- setorderv(combined_holdings, "diff_transitive_dpi", -1, na.last = T)[1:100]
top100_holders_diff_transitive_npi <- setorderv(combined_holdings, "diff_transitive_npi", -1, na.last = T)[1:100]
# Export point
#fwrite(top100_holders_diff_npi_dpi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_npi_dpi_2021_long_2aug21.csv")
#fwrite(top100_holders_diff_transitive_dpi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_dpi_2021_long_2aug21.csv")
#fwrite(top100_holders_diff_transitive_npi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_npi_2021_long_2aug21.csv")
############
# Agreement between rankings
# Add the ranks (takes time)
for(obj in holder_objects) {
  # Create temporary variable prioritising super-holders
  if( grepl("transitive", obj) ) {
    get(obj)[, superholder := 0]
    get(obj)[participant_type == "SH", superholder := 1]
  }
  # Sort by descending share within companies
  if( grepl("transitive", obj) ) {
    setorderv(get(obj), c("company_number", "superholder", "share"), c(1, -1, -1))
  } else {
    setorderv(get(obj), c("company_number", "share"), c(1, -1))
  }
  # Add the rank that prioritises super-holders
  get(obj)[, rank := 1:.N, by = c("company_number")]
  message(obj)
}
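# Optional check (illustrative): in the transitive tables the top rank should go to a
# super-holder whenever one is present among the company's participants
#holders_transitive_alpha0.999[ rank == 1, .N, by = "participant_type"]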
# Combine the ranks from all the methods
vars_to_keep <- c("company_number", "participant_id", "rank")
## DPI and NPI
holders_rankings <- merge(holders_dpi[, c(vars_to_keep), with = F], holders_npi[, c(vars_to_keep), with = F], by = c("company_number", "participant_id"), all = T)
setnames(holders_rankings, c("rank.x", "rank.y"), c("rank_dpi", "rank_npi"))
## Add transitive holder rankings (only super-holders or top-ranking non-super-holders)
for( a in alphas) {
holders_rankings <- merge(holders_rankings, get(paste0("holders_transitive_alpha", a))[ participant_type == "SH" | (participant_type != "SH" & rank == 1), c(vars_to_keep), with = F], by = c("company_number", "participant_id"), all = T)
setnames(holders_rankings, c("rank"), paste0("rank_transitive", a))
}
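# At this point holders_rankings has one row per (company, participant) pair with
# rank_dpi, rank_npi and one rank_transitive<alpha> column per value of alpha.
# Optional peek (illustrative):
#holders_rankings[1:5]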
# Count holders by company from each method
holders_rankings[, countholders_dpi := sum(!is.na(rank_dpi)), by = "company_number"]
holders_rankings[, countholders_npi := sum(!is.na(rank_npi)), by = "company_number"]
for( a in alphas) {
holders_rankings[, paste0("countholders_transitive", a) := sum(!is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
message(a)
}
# Pairs of counts of holders by company
holders_rankings[, countholders_dpi_npi := sum(!is.na(rank_dpi) & !is.na(rank_npi)), by = "company_number"]
for( a in alphas) {
holders_rankings[, paste0("countholders_dpi_transitive", a) := sum(!is.na(rank_dpi) & !is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
holders_rankings[, paste0("countholders_npi_transitive", a) := sum(!is.na(rank_npi) & !is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
message(a)
}
## Compute the company-level Kendall's tau-b rank correlation coefficient
### (takes about 15 minutes per pair)
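# For reference (added note, not part of the original pipeline): with tied ranks,
# cor(x, y, method = "kendall") returns the ties-corrected tau-b statistic,
#   tau_b = (n_c - n_d) / sqrt((n0 - n1) * (n0 - n2)),
# where n_c and n_d are the counts of concordant and discordant pairs, n0 = n*(n-1)/2,
# and n1, n2 sum t*(t-1)/2 over groups of tied values in x and y respectively.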
tau_dpi_npi <- holders_rankings[ countholders_dpi_npi > 1 & !is.na(rank_dpi) & !is.na(rank_npi), list(kendalltau_dpi_npi = cor(rank_dpi, rank_npi, method = "kendall", use = "pairwise.complete.obs")), by = "company_number"]
for( a in alphas) {
eval(parse(text = paste0("tau_dpi_transitive", a, " <- holders_rankings[ countholders_dpi_transitive", a, " > 1 & !is.na(rank_dpi) & !is.na(rank_transitive", a, "), list(kendalltau_dpi_transitive", a, " = cor(rank_dpi, rank_transitive", a, ", method = 'kendall', use = 'pairwise.complete.obs')), by = 'company_number']")))
eval(parse(text = paste0("tau_npi_transitive", a, " <- holders_rankings[ countholders_npi_transitive", a, " > 1 & !is.na(rank_npi) & !is.na(rank_transitive", a, "), list(kendalltau_npi_transitive", a, " = cor(rank_npi, rank_transitive", a, ", method = 'kendall', use = 'pairwise.complete.obs')), by = 'company_number']")))
message(a)
}
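# For readability: the call generated by eval(parse(...)) above for, e.g., alpha = 0.999
# is equivalent to the following direct data.table expression
#tau_dpi_transitive0.999 <- holders_rankings[ countholders_dpi_transitive0.999 > 1 & !is.na(rank_dpi) & !is.na(rank_transitive0.999),
#  list(kendalltau_dpi_transitive0.999 = cor(rank_dpi, rank_transitive0.999, method = "kendall", use = "pairwise.complete.obs")), by = "company_number"]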
# Examples of companies with complex rankings
#holders_rankings[ company_number == "00026306"]
#holders_rankings[ company_number == "00000529"]
#holders_rankings[ company_number == "00018751"]
# Debug: these should be zeros
#holders_rankings[ countholders_dpi == 1 & countholders_npi == 1 & rank_dpi != rank_npi]
#holders_rankings[ countholders_transitive0.999 == 1 & countholders_npi == 1 & rank_transitive0.999 != rank_npi]
# Assign tau-b = 1 to companies with exactly one participant under both methods in the pair
tau_dpi_npi <- rbind(tau_dpi_npi, data.table(company_number = unique(holders_rankings[ countholders_dpi == 1 & countholders_npi == 1 & !is.na(rank_dpi) & !is.na(rank_npi) ]$company_number), kendalltau_dpi_npi = 1))
for( a in alphas) {
eval(parse(text = paste0("tau_dpi_transitive", a, " <- rbind(tau_dpi_transitive", a, ", data.table( company_number = unique(holders_rankings[ countholders_dpi == 1 & countholders_transitive", a, " == 1 & !is.na(rank_dpi) & !is.na(rank_transitive", a, ")]$company_number), kendalltau_dpi_transitive", a, " = 1))")))
eval(parse(text = paste0("tau_npi_transitive", a, " <- rbind(tau_npi_transitive", a, ", data.table( company_number = unique(holders_rankings[ countholders_npi == 1 & countholders_transitive", a, " == 1 & !is.na(rank_npi) & !is.na(rank_transitive", a, ")]$company_number), kendalltau_npi_transitive", a, " = 1))")))
}
# Assign tau-b = -1 to the remaining companies, where the methods identify differing numbers
# of participants and the ranks do not match. Example: holders_rankings[ company_number == "00026306"]
tau_dpi_npi <- rbind(tau_dpi_npi, data.table(company_number = unique(holders_rankings[ !(company_number %in% tau_dpi_npi$company_number)]$company_number), kendalltau_dpi_npi = -1))
for( a in alphas) {
eval(parse(text = paste0("tau_dpi_transitive", a, " <- rbind(tau_dpi_transitive", a, ", data.table( company_number = unique(holders_rankings[ !(company_number %in% tau_dpi_transitive", a, "$company_number) ]$company_number), kendalltau_dpi_transitive", a, " = -1))")))
eval(parse(text = paste0("tau_npi_transitive", a, " <- rbind(tau_npi_transitive", a, ", data.table( company_number = unique(holders_rankings[ !(company_number %in% tau_npi_transitive", a, "$company_number) ]$company_number), kendalltau_npi_transitive", a, " = -1))")))
}
# Finally, we need to fill in NAs like tau_dpi_transitive0.999[company_number == "00001615"]
# holders_rankings[ company_number == "00001615" ]
# holders_rankings[ company_number == "SO307326" ]
# holders_rankings[ company_number == "00000529" ]
# Merge the taus
for( a in alphas) {
if( which(alphas == a) == 1 ) {
kendall_taus_participant_ranks <- merge(tau_dpi_npi, get(paste0("tau_dpi_transitive", a)), by = "company_number", all = T)
kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_npi_transitive", a)), by = "company_number", all = T)
} else {
kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_dpi_transitive", a)), by = "company_number", all = T)
kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_npi_transitive", a)), by = "company_number", all = T)
}
message(a)
}
# Add the information on the count of holders
kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, unique(holders_rankings[, c("company_number", "countholders_dpi", "countholders_npi", paste0("countholders_transitive", alphas)), with = F], by = "company_number"), by = "company_number", all.x = T, all.y = F)
# Save point
fwrite(kendall_taus_participant_ranks, file = "output/alphaicon_paper/kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
############
# Analyse the agreement between rankings
library(data.table)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(stringi)
library(stringr)
library(stargazer)
library(showtext)
setwd(Sys.getenv('ALPHAICON_PATH'))
# Add the font to use
font_add_google("Open Sans", "Open Sans")
showtext_auto()
# Load the produced CSV with Kendall's tau-b
taus_longs <- fread("output/alphaicon_paper/kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv", encoding = "UTF-8")
# Reshape to long form (one row per company and algorithm pair, resp. per company and algorithm)
taus <- melt(taus_longs, id.vars = "company_number", measure.vars = names(taus_longs)[grepl("kendalltau", names(taus_longs))], variable.name = "pair", value.name = "tau", na.rm = T, variable.factor = T)
countholders <- melt(taus_longs, id.vars = "company_number", measure.vars = names(taus_longs)[grepl("countholders", names(taus_longs))], variable.name = "algorithm", value.name = "countholders", na.rm = T, variable.factor = T)
# Convert pairs to extract useful information
alg_pairs <- data.table(pair = levels(taus$pair))
alg_pairs[, pair_modif := gsub("kendalltau_", "", pair, fixed = T)]
alg_pairs[, c("algorithm1", "algorithm2") := as.data.table(stri_split_fixed(pair_modif, "_", n = 2, simplify = T))]
alg_pairs[, pair_modif := NULL]
alg_pairs[, pair := factor(pair, levels = levels(taus$pair))]
alg_pairs[, alpha := NA_real_ ]
alg_pairs[ grepl("transitive", algorithm2), alpha := as.numeric(gsub("transitive", "", algorithm2))]
alg_pairs[ grepl("transitive", algorithm2), algorithm2 := gsub("[^transitive]", "", algorithm2)]
alg_pairs[, algorithm1 := factor(algorithm1, levels = c("dpi", "npi", "transitive"))]
alg_pairs[, algorithm2 := factor(algorithm2, levels = c("dpi", "npi", "transitive"))]
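# Illustrative peek (optional): alg_pairs now maps each kendalltau_* column
# to its pair of algorithms and, for the transitive method, to the alpha value
#alg_pairs[]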
taus <- merge(taus, alg_pairs, by = "pair")
taus[, pair := NULL]
gc()
# Same for countholders
alg_counts <- data.table(algorithm = levels(countholders$algorithm))
alg_counts[, alg_modif := gsub("countholders_", "", algorithm)]
alg_counts[ grepl("transitive", alg_modif), alpha := as.numeric(gsub("transitive", "", alg_modif))]
alg_counts[ grepl("transitive", alg_modif), alg_modif := gsub("[^transitive]", "", alg_modif)]
alg_counts[, alg_modif := factor(alg_modif, levels = c("dpi", "npi", "transitive"))]
countholders <- merge(countholders, alg_counts, by = "algorithm")
countholders[, algorithm := NULL]
setnames(countholders, "alg_modif", "algorithm")
gc()
# Add count holders to tau
#taus <- merge(taus, countholders, by.x = c("algorithm1", "alpha"), by.y = c("algorithm", "alpha"))
#setnames(taus, "countholders", "countholders1")
#taus <- merge(taus, countholders, by.x = c("algorithm2", "alpha"), by.y = c("algorithm", "alpha"))
#setnames(taus, "countholders", "countholders2")
# Add the entity's industry section to the data (peak memory use of about 40 GB)
taus[, industrysection := uk_basic_companies_data[ match(taus$company_number, uk_basic_companies_data$CompanyNumber)]$industrysection_1 ]
gc()
# Summary statistics of Kendall's tau-b
## Mean, overall
taus_overall_mean <- taus[!is.na(tau), list(tau = mean(tau), tausemean = sd(tau)/sqrt(.N)), by = c("algorithm1", "algorithm2", "alpha")]
## Mean, by industry
taus_industry_mean <- taus[!is.na(tau) & !is.na(industrysection), list( tau = mean(tau), tausemean = sd(tau)/sqrt(.N)), by = c("algorithm1", "algorithm2", "alpha", "industrysection")]
# Export point
#fwrite(taus_overall_mean, file = "output/alphaicon_paper/mean_kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
#fwrite(taus_industry_mean, file = "output/alphaicon_paper/mean_by_industrysection_kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
# Produce the plot with overall mean
taus_overall_mean[ grepl("dpi|npi", algorithm1), algorithm1 := toupper(algorithm1)]
## NPI vs the transitive (alpha-ICON) super-holder ranking across the values of alpha
taus_mean_plot <- ggplot(aes(x = alpha, y = tau, group = algorithm1, color = algorithm1), data = taus_overall_mean[ algorithm1 == "NPI" & algorithm2 == "transitive"]) +
  geom_line(size = 1.5, alpha = 1) +
  geom_point(size = 2) +
  geom_text(aes(label = round(tau, 4)), size = 2.5, nudge_y = 0.0005) +
  scale_x_continuous( breaks = c(seq(0, 0.9, 0.1), 0.999), labels = c(seq(0, 0.9, 0.1), 0.999), guide = guide_axis(n.dodge = 1)) +
  #facet_wrap(. ~ algorithm1, scales = "free_y")+
  labs(y = expression("Kendall's "~tau~"-b betw. NPI\nand "~alpha~"-ICON SH ranks"), x = expression(alpha)) +
  scale_colour_brewer(type = "qual", palette = 2) +
  theme_minimal() + theme(legend.position = "none", text = element_text(size = 14, family = "Open Sans"), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
#ggsave(taus_mean_plot, file = "output/alphaicon_paper/taus_mean_plot.pdf", width = 8.5, height = 4, device = cairo_pdf, scale = 1)
# Export to a table
#stargazer(taus_overall_mean[ algorithm1 == "NPI" & algorithm2 == "transitive"], type = "latex", summary = F, out = "output/alphaicon_paper/taus_overall_mean.tex", float = F, multicolumn = T, digits = 5)