# alphaicon/code/alphaicon_paper/6_rank_top_holders.r
# This code sums the computed control shares
# of the super-holders in the PSC data and
# produces the rankings under different algorithms
# for the α-ICON paper
library(data.table)
library(lubridate)
library(stringi)
library(stringr)
library(stargazer)
# Declare working directory beforehand in an environment variable
# ALPHAICON_PATH = "path_to_your_folder"
# with the aid of usethis::edit_r_environ()
# Restart R session for the changes to take effect
setwd(Sys.getenv('ALPHAICON_PATH'))
############
# Data loading
# Load the active participants snapshot of PSC (prepared by data_preparation/uk/2_psc_snapshot_to_participants_panel.r)
psc <- fread("output/uk/uk_organisations_participants_2021_long_2aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
# Load the companies data (prepared by data_preparation/uk/1b_process_companies_data.r)
load("data/uk/uk_basic_companies_data_2021-08-01.rdata")
# Load the DPI of each participant after 10000 iterations (computed by alphaicon_paper/2_compute_npi_dpi.r)
holders_dpi <- fread("output/uk/npi_dpi/10000iter/uk_organisations_participants_2021_long_7sep21_dpi_10000iter.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
setnames(holders_dpi, c("entity", "participant", "dpi"), c("company_number", "participant_id", "share"))
holders_dpi[, times_pivotal := NULL ]
# Load the NPI of each participant after 10000 iterations (computed by alphaicon_paper/2_compute_npi_dpi.r)
holders_npi <- fread("output/uk/npi_dpi/10000iter/uk_organisations_participants_2021_long_7sep21_npi_10000iter.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
setnames(holders_npi, c("entity", "participant", "npi"), c("company_number", "participant_id", "share"))
holders_npi[, times_pivotal := NULL ]
# Load the transitive ownership data at various alpha
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
alphas <- c("0.0", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "0.999")
for( a in alphas) {
  temp <- fread(paste0("output/uk/transitive/uk_organisations_transitive_ownership_alpha", a, "_2021_long_2aug21.csv"), integer64 = "character", na.strings = "", encoding = "UTF-8", colClasses = c("character", "character", "numeric"))
  assign(paste0("holders_transitive_alpha", a), temp)
  rm(temp)
}
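# Design-choice note: an equivalent pattern (a hedged sketch, not used below)
# would keep the per-alpha tables in a named list rather than in assign()/get()-managed objects:
# holders_transitive <- lapply(setNames(alphas, alphas), function(a)
#   fread(paste0("output/uk/transitive/uk_organisations_transitive_ownership_alpha", a, "_2021_long_2aug21.csv"),
#         integer64 = "character", na.strings = "", encoding = "UTF-8",
#         colClasses = c("character", "character", "numeric")))
# The assign()-created objects are kept here because the rest of the script refers to them by name.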
# Load the data on graph membership
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
graph_membership <- fread("output/uk/uk_organisations_participation_graph_core_periphery_membership_6aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8", header = T, colClasses = c("numeric", "character", "factor"))
graph_membership[, V1 := NULL ]
setnames(graph_membership, "company_number/id", "participant_id")
# All holder objects that are loaded in the memory
holder_objects <- paste0("holders_", c("dpi", "npi", paste0("transitive_alpha", alphas)))
# Add the graph membership and remove self-loops
for(obj in holder_objects) {
  get(obj)[, participant_type := graph_membership[ match(get(obj)$participant_id, graph_membership$participant_id) ]$type ]
  get(obj)[, company_type := graph_membership[ match(get(obj)$company_number, graph_membership$participant_id) ]$type ]
  assign(obj, get(obj)[ company_number != participant_id ])
}
psc[, participant_type := graph_membership[ match(psc$participant_id, graph_membership$participant_id) ]$type ]
psc[, company_type := graph_membership[ match(psc$company_number, graph_membership$participant_id) ]$type ]
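# The match()-based lookups above act as left joins on participant_id.
# A minimal sketch on toy data (hypothetical values, illustrative only):
toy_lookup <- data.table(participant_id = c("A", "B"), type = c("SH", "I"))
toy_edges <- data.table(company_number = c("X", "X"), participant_id = c("A", "C"))
toy_edges[, participant_type := toy_lookup[ match(toy_edges$participant_id, toy_lookup$participant_id) ]$type ]
# Equivalent to: merge(toy_edges, toy_lookup, by = "participant_id", all.x = TRUE, sort = FALSE)
rm(toy_lookup, toy_edges)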
############
# Identify top holders
# Compute total holdings (weighted and unweighted by assets)
holdings_dpi <- holders_dpi[, list( sumdpi = sum(share)), by = "participant_id"]
holdings_dpi[, rankdpi := frankv(sumdpi, order = -1, ties.method = "first")]
holdings_npi <- holders_npi[, list( sumnpi = sum(share)), by = "participant_id"]
holdings_npi[, ranknpi := frankv(sumnpi, order = -1, ties.method = "first")]
# Normalize shares to sum to one within each company and participant type for our transitive data
holders_transitive_alpha0.999[, share_typenormalized := share/sum(share), by = c("company_number", "participant_type")]
# Keep only super-holders and sum up their shares
holdings_transitive <- holders_transitive_alpha0.999[participant_type == "SH", list( sumtransitive = sum(share_typenormalized)), by = "participant_id"]
# Rank the holders by their share
holdings_transitive[, ranktransitive := frankv(sumtransitive, order = -1, ties.method = "first")]
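# Sanity-check sketch on toy data (illustrative values only): within each
# company and participant type the normalized shares should sum to one.
toy_shares <- data.table(company_number = c("X", "X", "X"), participant_type = c("SH", "SH", "I"), share = c(0.2, 0.6, 0.1))
toy_shares[, share_typenormalized := share/sum(share), by = c("company_number", "participant_type")]
toy_shares[, sum(share_typenormalized), by = c("company_number", "participant_type")] # both groups sum to 1
rm(toy_shares)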
# Top-100 participants by our metric compared to other metrics
top_holders <- setorderv(holdings_transitive, "ranktransitive", 1)[1:100]
# Add ranks from other algorithms
top_holders <- merge(top_holders, holdings_dpi, by = "participant_id", all.x = T, all.y = F, sort = F)
top_holders <- merge(top_holders, holdings_npi, by = "participant_id", all.x = T, all.y = F, sort = F)
# Proper names
## First, match on company number with companies data
top_holders[, participant_name := uk_basic_companies_data[ match(top_holders$participant_id, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
## Second, extract the company name
top_holders[ str_count(participant_id, fixed("$")) == 2, participant_name := stri_split_fixed(participant_id, "$", n = 3, simplify = T)[,1] ]
## Third, add the first and last names of the persons
top_holders[ str_count(participant_id, fixed("$")) == 4, participant_name := paste0(stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 1], " ", stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 3]) ]
## We no longer need participant id
#top_holders[, participant_id := NULL ]
# Proper column order
setcolorder(top_holders, c("participant_name", "participant_id", "ranktransitive", "ranknpi", "rankdpi", "sumtransitive", "sumnpi", "sumdpi"))
# Round the score
top_holders[, c("sumtransitive", "sumnpi", "sumdpi") := lapply(.SD, round, 1), .SDcols = c("sumtransitive", "sumnpi", "sumdpi") ]
# Add the size of the network of holdings and the core-shell distribution
top_holders_network_chars <- data.table()
for(superholder_id in top_holders$participant_id) {
  # Debug: superholder_id <- "SC095000"
  # Ultimate companies held by this organization
  holders_subset_ultimate <- holders_transitive_alpha0.999[ participant_id %in% superholder_id ]
  # Consider their total interest in all entities
  holders_subset <- psc[ participant_id %in% unique(holders_subset_ultimate$company_number) | company_number %in% unique(holders_subset_ultimate$company_number) ]
  holders_subset <- holders_subset[ !is.na(equity_share)]
  holders_subset <- holders_subset[ participant_id != company_number]
  # Share to 0-1 range
  setnames(holders_subset, "equity_share", "share")
  holders_subset[, share := share/100]
  ## Create an object with vertex metadata
  vertex_metadata <- unique(rbind(unique(data.table( name = holders_subset$company_number, type = holders_subset$company_type)),
                                  data.table( name = holders_subset$participant_id, type = holders_subset$participant_type)
                                  ), by = "name")
  # Count all vertex types
  out <- vertex_metadata[, .N, by = "type"]
  # Transpose to a single row (a toy sketch of this dcast(melt()) idiom follows after this loop)
  out <- dcast(melt(out, id.vars = "type"), variable ~ type)
  out[, variable := NULL ]
  # Keep only the columns of interest in the order of interest
  types_of_interest <- c("SH", "ST", "I", "C")
  names_outside_types_of_interest <- types_of_interest[!(types_of_interest %in% names(out))]
  if( length(names_outside_types_of_interest) > 0) {
    out[, c(names_outside_types_of_interest) := 0]
  }
  out <- out[, types_of_interest, with = F]
  # Add the network size
  out <- data.table(participant_id = superholder_id, nodes = nrow(vertex_metadata), out)
  top_holders_network_chars <- rbind(top_holders_network_chars, out, fill = T)
  message(superholder_id)
}
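# Toy sketch of the dcast(melt(...)) one-row transpose used in the loop above
# (illustrative counts only):
toy_counts <- data.table(type = c("SH", "I"), N = c(3L, 7L))
dcast(melt(toy_counts, id.vars = "type"), variable ~ type) # -> one row with columns I = 7, SH = 3
rm(toy_counts)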
# Add to the top holders data
top_holders <- merge(top_holders, top_holders_network_chars, by = "participant_id", sort = F)
# Write to CSV
fwrite(top_holders, file = "output/alphaicon_paper/uk_organisations_top100_holders_2021_long_2aug21.csv")
# Produce a TeX table
#stargazer(top_holders[1:25, -"participant_id"], type = "latex", summary = F, out = "output/alphaicon_paper/uk_organisations_top25_holders_2021_long_2aug21.tex", float = F, multicolumn = T, digits = 1)
# Export the information on top-25 entities with the largest deviation between ranks
combined_holdings <- merge(holdings_dpi, holdings_npi, by = "participant_id", all = T)
combined_holdings <- merge(combined_holdings, holdings_transitive, by = "participant_id", all = T)
combined_holdings[, diff_npi_dpi := sumnpi - sumdpi ]
combined_holdings[, diff_transitive_dpi := sumtransitive - sumdpi ]
combined_holdings[, diff_transitive_npi := sumtransitive - sumnpi ]
top100_holders_diff_npi_dpi <- setorderv(combined_holdings, "diff_npi_dpi", -1, na.last = T)[1:100]
top100_holders_diff_transitive_dpi <- setorderv(combined_holdings, "diff_transitive_dpi", -1, na.last = T)[1:100]
top100_holders_diff_transitive_npi <- setorderv(combined_holdings, "diff_transitive_npi", -1, na.last = T)[1:100]
# Export point
#fwrite(top100_holders_diff_npi_dpi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_npi_dpi_2021_long_2aug21.csv")
#fwrite(top100_holders_diff_transitive_dpi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_dpi_2021_long_2aug21.csv")
#fwrite(top100_holders_diff_transitive_npi, file = "output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_npi_2021_long_2aug21.csv")
############
# Agreement between rankings
# Add the ranks (takes time)
for(obj in holder_objects) {
  # Create temporary variable prioritising super-holders
  if( grepl("transitive", obj) ) {
    get(obj)[, superholder := 0]
    get(obj)[participant_type == "SH", superholder := 1]
  }
  # Sort by descending share within companies
  if( grepl("transitive", obj) ) {
    setorderv(get(obj), c("company_number", "superholder", "share"), c(1, -1, -1))
  } else {
    setorderv(get(obj), c("company_number", "share"), c(1, -1))
  }
  # Add the rank that prioritises super-holders
  get(obj)[, rank := 1:.N, by = c("company_number")]
  message(obj)
}
# Combine the ranks from all the methods
vars_to_keep <- c("company_number", "participant_id", "rank")
## DPI and NPI
holders_rankings <- merge(holders_dpi[, c(vars_to_keep), with = F], holders_npi[, c(vars_to_keep), with = F], by = c("company_number", "participant_id"), all = T)
setnames(holders_rankings, c("rank.x", "rank.y"), c("rank_dpi", "rank_npi"))
## Add transitive holder rankings (only super-holders or top-ranking non-super-holders)
for( a in alphas) {
  holders_rankings <- merge(holders_rankings, get(paste0("holders_transitive_alpha", a))[ participant_type == "SH" | (participant_type != "SH" & rank == 1), c(vars_to_keep), with = F], by = c("company_number", "participant_id"), all = T)
  setnames(holders_rankings, c("rank"), paste0("rank_transitive", a))
}
# Count holders by company from each method
holders_rankings[, countholders_dpi := sum(!is.na(rank_dpi)), by = "company_number"]
holders_rankings[, countholders_npi := sum(!is.na(rank_npi)), by = "company_number"]
for( a in alphas) {
  holders_rankings[, paste0("countholders_transitive", a) := sum(!is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
  message(a)
}
# Pairs of counts of holders by company
holders_rankings[, countholders_dpi_npi := sum(!is.na(rank_dpi) & !is.na(rank_npi)), by = "company_number"]
for( a in alphas) {
  holders_rankings[, paste0("countholders_dpi_transitive", a) := sum(!is.na(rank_dpi) & !is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
  holders_rankings[, paste0("countholders_npi_transitive", a) := sum(!is.na(rank_npi) & !is.na(get(paste0("rank_transitive", a)))), by = "company_number"]
  message(a)
}
## Compute company-level Kendall's tau-b rank correlation coefficient
### (takes about 15 minutes per pair)
tau_dpi_npi <- holders_rankings[ countholders_dpi_npi > 1 & !is.na(rank_dpi) & !is.na(rank_npi), list(kendalltau_dpi_npi = cor(rank_dpi, rank_npi, method = "kendall", use = "pairwise.complete.obs")), by = "company_number"]
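# Minimal sanity check of the Kendall's tau-b call on toy rank vectors
# (illustrative only, not part of the analysis):
stopifnot(cor(c(1, 2, 3), c(1, 2, 3), method = "kendall", use = "pairwise.complete.obs") == 1)
cor(c(1, 2, 3), c(1, 3, 2), method = "kendall", use = "pairwise.complete.obs") # 1/3: one discordant pair out of three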
for( a in alphas) {
  eval(parse(text = paste0("tau_dpi_transitive", a, " <- holders_rankings[ countholders_dpi_transitive", a, " > 1 & !is.na(rank_dpi) & !is.na(rank_transitive", a, "), list(kendalltau_dpi_transitive", a, " = cor(rank_dpi, rank_transitive", a, ", method = 'kendall', use = 'pairwise.complete.obs')), by = 'company_number']")))
  eval(parse(text = paste0("tau_npi_transitive", a, " <- holders_rankings[ countholders_npi_transitive", a, " > 1 & !is.na(rank_npi) & !is.na(rank_transitive", a, "), list(kendalltau_npi_transitive", a, " = cor(rank_npi, rank_transitive", a, ", method = 'kendall', use = 'pairwise.complete.obs')), by = 'company_number']")))
  message(a)
}
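# For readability: the string that eval(parse()) assembles above is equivalent,
# for a single alpha (here 0.999), to the following (shown commented out since
# the loop above has already produced these objects):
# tau_dpi_transitive0.999 <- holders_rankings[ countholders_dpi_transitive0.999 > 1 & !is.na(rank_dpi) & !is.na(rank_transitive0.999),
#   list(kendalltau_dpi_transitive0.999 = cor(rank_dpi, rank_transitive0.999, method = "kendall", use = "pairwise.complete.obs")),
#   by = "company_number"]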
# Examples of companies with complex rankings
#holders_rankings[ company_number == "00026306"]
#holders_rankings[ company_number == "00000529"]
#holders_rankings[ company_number == "00018751"]
# Debug: these should be zeros
#holders_rankings[ countholders_dpi == 1 & countholders_npi == 1 & rank_dpi != rank_npi]
#holders_rankings[ countholders_transitive0.999 == 1 & countholders_npi == 1 & rank_transitive0.999 != rank_npi]
# Add the tau-b = 1 for companies with one participant (both direct and indirect)
tau_dpi_npi <- rbind(tau_dpi_npi, data.table(company_number = unique(holders_rankings[ countholders_dpi == 1 & countholders_npi == 1 & !is.na(rank_dpi) & !is.na(rank_npi) ]$company_number), kendalltau_dpi_npi = 1))
for( a in alphas) {
  eval(parse(text = paste0("tau_dpi_transitive", a, " <- rbind(tau_dpi_transitive", a, ", data.table( company_number = unique(holders_rankings[ countholders_dpi == 1 & countholders_transitive", a, " == 1 & !is.na(rank_dpi) & !is.na(rank_transitive", a, ")]$company_number), kendalltau_dpi_transitive", a, " = 1))")))
  eval(parse(text = paste0("tau_npi_transitive", a, " <- rbind(tau_npi_transitive", a, ", data.table( company_number = unique(holders_rankings[ countholders_npi == 1 & countholders_transitive", a, " == 1 & !is.na(rank_npi) & !is.na(rank_transitive", a, ")]$company_number), kendalltau_npi_transitive", a, " = 1))")))
}
# Add the tau-b = -1 for companies with differing number of participants
# where ranks do not match. Example: holders_rankings[ company_number == "00026306"]
tau_dpi_npi <- rbind(tau_dpi_npi, data.table(company_number = unique(holders_rankings[ !(company_number %in% tau_dpi_npi$company_number)]$company_number), kendalltau_dpi_npi = -1))
for( a in alphas) {
  eval(parse(text = paste0("tau_dpi_transitive", a, " <- rbind(tau_dpi_transitive", a, ", data.table( company_number = unique(holders_rankings[ !(company_number %in% tau_dpi_transitive", a, "$company_number) ]$company_number), kendalltau_dpi_transitive", a, " = -1))")))
  eval(parse(text = paste0("tau_npi_transitive", a, " <- rbind(tau_npi_transitive", a, ", data.table( company_number = unique(holders_rankings[ !(company_number %in% tau_npi_transitive", a, "$company_number) ]$company_number), kendalltau_npi_transitive", a, " = -1))")))
}
# Finally, we need to fill in NAs like tau_dpi_transitive0.999[company_number == "00001615"]
# holders_rankings[ company_number == "00001615" ]
# holders_rankings[ company_number == "SO307326" ]
# holders_rankings[ company_number == "00000529" ]
# Merge the taus
for( a in alphas) {
  if( which(alphas == a) == 1 ) {
    kendall_taus_participant_ranks <- merge(tau_dpi_npi, get(paste0("tau_dpi_transitive", a)), by = "company_number", all = T)
    kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_npi_transitive", a)), by = "company_number", all = T)
  } else {
    kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_dpi_transitive", a)), by = "company_number", all = T)
    kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, get(paste0("tau_npi_transitive", a)), by = "company_number", all = T)
  }
  message(a)
}
# Add the information on the count of holders
kendall_taus_participant_ranks <- merge(kendall_taus_participant_ranks, unique(holders_rankings[, c("company_number", "countholders_dpi", "countholders_npi", paste0("countholders_transitive", alphas)), with = F], by = "company_number"), by = "company_number", all.x = T, all.y = F)
# Save point
fwrite(kendall_taus_participant_ranks, file = "output/alphaicon_paper/kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
############
# Analyse the agreement between rankings
library(data.table)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(stringi)
library(stringr)
library(stargazer)
library(showtext)
setwd(Sys.getenv('ALPHAICON_PATH'))
# Add the font to use
font_add_google("Open Sans", "Open Sans")
showtext_auto()
# Load the produced CSV with Kendall's tau-b
taus_longs <- fread("output/alphaicon_paper/kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv", encoding = "UTF-8")
# To long form (one row per company and method pair, with its alpha)
taus <- melt(taus_longs, id.vars = "company_number", measure.vars = names(taus_longs)[grepl("kendalltau", names(taus_longs))], variable.name = "pair", value.name = "tau", na.rm = T, variable.factor = T)
countholders <- melt(taus_longs, id.vars = "company_number", measure.vars = names(taus_longs)[grepl("countholders", names(taus_longs))], variable.name = "algorithm", value.name = "countholders", na.rm = T, variable.factor = T)
# Convert pairs to extract useful information
alg_pairs <- data.table(pair = levels(taus$pair))
alg_pairs[, pair_modif := gsub("kendalltau_", "", pair, fixed = T)]
alg_pairs[, c("algorithm1", "algorithm2") := as.data.table(stri_split_fixed(pair_modif, "_", n = 2, simplify = T))]
alg_pairs[, pair_modif := NULL]
alg_pairs[, pair := factor(pair, levels = levels(taus$pair))]
alg_pairs[, alpha := NA_real_ ]
alg_pairs[ grepl("transitive", algorithm2), alpha := as.numeric(gsub("transitive", "", algorithm2))]
alg_pairs[ grepl("transitive", algorithm2), algorithm2 := gsub("[^transitive]", "", algorithm2)]
alg_pairs[, algorithm1 := factor(algorithm1, levels = c("dpi", "npi", "transitive"))]
alg_pairs[, algorithm2 := factor(algorithm2, levels = c("dpi", "npi", "transitive"))]
taus <- merge(taus, alg_pairs, by = "pair")
taus[, pair := NULL]
gc()
# Same for countholders
alg_counts <- data.table(algorithm = levels(countholders$algorithm))
alg_counts[, alg_modif := gsub("countholders_", "", algorithm)]
alg_counts[ grepl("transitive", alg_modif), alpha := as.numeric(gsub("transitive", "", alg_modif))]
alg_counts[ grepl("transitive", alg_modif), alg_modif := gsub("[^transitive]", "", alg_modif)]
alg_counts[, alg_modif := factor(alg_modif, levels = c("dpi", "npi", "transitive"))]
countholders <- merge(countholders, alg_counts, by = "algorithm")
countholders[, algorithm := NULL]
setnames(countholders, "alg_modif", "algorithm")
gc()
# Add count holders to tau
#taus <- merge(taus, countholders, by.x = c("algorithm1", "alpha"), by.y = c("algorithm", "alpha"))
#setnames(taus, "countholders", "countholders1")
#taus <- merge(taus, countholders, by.x = c("algorithm2", "alpha"), by.y = c("algorithm", "alpha"))
#setnames(taus, "countholders", "countholders2")
# Add entity industry to the data (peak memory use of 40 GB)
# NB: this relies on uk_basic_companies_data still being in memory from the load() call earlier in this script
taus[, industrysection := uk_basic_companies_data[ match(taus$company_number, uk_basic_companies_data$CompanyNumber)]$industrysection_1 ]
gc()
# Summary statistics of Kendall's tau-b
## Mean, overall
taus_overall_mean <- taus[!is.na(tau), list(tau = mean(tau), tausemean = sd(tau)/sqrt(.N)), by = c("algorithm1", "algorithm2", "alpha")]
## Mean, by industry
taus_industry_mean <- taus[!is.na(tau) & !is.na(industrysection), list( tau = mean(tau), tausemean = sd(tau)/sqrt(.N)), by = c("algorithm1", "algorithm2", "alpha", "industrysection")]
# Export point
#fwrite(taus_overall_mean, file = "output/alphaicon_paper/mean_kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
#fwrite(taus_industry_mean, file = "output/alphaicon_paper/mean_by_industrysection_kendall_taus_participant_ranks_dpi_npi_transitive_uk_organisations_participants_2021_7sep21.csv")
# Produce the plot with overall mean
taus_overall_mean[ grepl("dpi|npi", algorithm1), algorithm1 := toupper(algorithm1)]
## Other algorithms vs best and worst-performing transitive algorithm
taus_mean_plot <- ggplot(aes(x = alpha, y = tau, group = algorithm1, color = algorithm1), data = taus_overall_mean[ algorithm1 == "NPI" & algorithm2 == "transitive"]) +
  geom_line(size = 1.5, alpha = 1) +
  geom_point(size = 2) +
  geom_text(aes(label = round(tau, 4)), size = 2.5, nudge_y = 0.0005) +
  scale_x_continuous( breaks = c(seq(0, 0.9, 0.1), 0.999), labels = c(seq(0, 0.9, 0.1), 0.999), guide = guide_axis(n.dodge = 1)) +
  #facet_wrap(. ~ algorithm1, scales = "free_y")+
  labs(y = expression("Kendall's "~tau~"-b betw. NPI\nand "~alpha~"-ICON SH ranks"), x = expression(alpha)) +
  scale_colour_brewer(type = "qual", palette = 2) +
  theme_minimal() + theme(legend.position = "none", text = element_text(size = 14, family = "Open Sans"), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
#ggsave(taus_mean_plot, file = "output/alphaicon_paper/taus_mean_plot.pdf", width = 8.5, height = 4, device = cairo_pdf, scale = 1)
# Export to a table
#stargazer(taus_overall_mean[ algorithm1 == "NPI" & algorithm2 == "transitive"], type = "latex", summary = F, out = "output/alphaicon_paper/taus_overall_mean.tex", float = F, multicolumn = T, digits = 5)