alphaicon/code/alphaicon_paper/4_illustrate_algorithm.r

258 lines
15 KiB
R
Raw Permalink Normal View History

2021-09-16 08:06:49 +01:00
# This code plots the selected sub-networks
# in the PSC data and produces the figures
# for the α-ICON paper
library(data.table)
library(igraph)
library(stringi)
library(stringr)
library(ggplot2)
library(ggnetwork)
library(ggthemes)
library(showtext)
# Add the font to use
font_add_google("Open Sans", "Open Sans")
showtext_auto()
# Declare working directory beforehand in an environment variable
# ALPHAICON_PATH = "path_to_your_folder"
# with the aid of usethis::edit_r_environ()
# Restart R session for the changes to make effect
setwd(Sys.getenv('ALPHAICON_PATH'))
# Load the active participants snapshot of PSC (prepared by data_preparation/uk/2_psc_snapshot_to_participants_panel.r)
psc <- fread("output/uk/uk_organisations_participants_2021_long_2aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8")
# Load the companies data (prepared by data_preparation/uk/1b_process_companies_data.r)
load("data/uk/uk_basic_companies_data_2021-08-01.rdata")
# Load the transitive ownership of each participant at alpha = 0.999
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
holders <- fread("output/uk/transitive/uk_organisations_transitive_ownership_alpha0.999_2021_long_2aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8", colClasses = c("character", "character", "numeric"))
# Load the data on graph membership
# (prepared by alphaicon_paper/1_compute_alphaicon.ipynb)
graph_membership <- fread("output/uk/uk_organisations_participation_graph_core_periphery_membership_6aug21.csv", integer64 = "character", na.strings = "", encoding = "UTF-8", header = T, colClasses = c("numeric", "character", "factor"))
graph_membership[, V1 := NULL ]
setnames(graph_membership, "company_number/id", "participant_id")
gc()
# We need to pick up an illustrative organization
# Ideally this organization should have a small network size
# and have at least one entity in graph core
# Orgs in the core with all other types in the subgraph
# and manageable network size
temp <- holders[ participant_id %in% graph_membership[ type == "C"]$participant_id ]
temp[, company_type := graph_membership[ match(temp$company_number, graph_membership$participant_id) ]$type ]
temp[, participant_type := graph_membership[ match(temp$participant_id, graph_membership$participant_id) ]$type ]
# Devise a set of candidate orgs for illustration
# where all organization types are present in the company network
candidate_orgs <- temp[, list( company_types = uniqueN(company_type), network_size = uniqueN(company_number)), by = "participant_id"]
# Consider moderately sized networks with all types of orgs there
candidate_ids <- unique(candidate_orgs[ company_types > 2 & network_size <= 30 & network_size > 10]$participant_id)
# Additionally, consider the largest structure in terms
# of graph path from participant to company
# This is the result of igraph::farthest_vertices
# applied to the psc graph
farthest_components <- c("11443992")
# Also consider participant SEAN DINNEN
# he has DPI and NPI of 0 but very large transitivity score
sean_dinnen <- "SEAN$MICHAEL$DINNEN$1968$3"
# Example of long path
heidelbergcement <- "HEIDELBERGCEMENT AG$GERMANY$HRB 33082"
# Finally, consider the organizations with large difference between
# the super-holder scores (computed in alphaicon_paper/6_rank_top_holders.r)
top100_holders_diff_npi_dpi <- fread("output/alphaicon_paper/uk_organisations_top100_holders_diff_npi_dpi_2021_long_2aug21.csv", encoding = "UTF-8")
top100_holders_diff_transitive_dpi <- fread("output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_dpi_2021_long_2aug21.csv", encoding = "UTF-8")
top100_holders_diff_transitive_npi <- fread("output/alphaicon_paper/uk_organisations_top100_holders_diff_transitive_npi_2021_long_2aug21.csv", encoding = "UTF-8")
# Define the candidates to consider (from affiliated_entities
# and other inspection)
organizations_of_interest <- unique(c("IHS MARKIT LTD$BERMUDA$48610", "06647317", "11594795", candidate_ids, farthest_components, sean_dinnen))
# Or specify one organization for the graph in the paper:
#organizations_of_interest <- "03782947" # OPUS
organizations_of_interest <- c(top100_holders_diff_transitive_dpi[1]$participant_id, # SPECSAVERS OPTICAL SUPERSTORES LTD
top100_holders_diff_npi_dpi[82]$participant_id, # LENDLEASE INTERNATIONAL PTY LIMITED
top100_holders_diff_transitive_dpi[99]$participant_id, # THE BERKELEY GROUP PLC
top100_holders_diff_transitive_npi[17]$participant_id) # BAJLINDER$KAUR$BOPARAN$1968$1. NB: her husband RANJIT$SINGH$BOPARAN$1966$8 has alpha-ICON sum 73
# Iterate over multiple graph candidates
for( org_interest in organizations_of_interest ) {
# Ultimate companies held by this organization
holders_subset_ultimate <- holders[ participant_id %in% org_interest ]
# Consider their total interest in all entities
holders_subset <- psc[ participant_id %in% unique(holders_subset_ultimate$company_number) | company_number %in% unique(holders_subset_ultimate$company_number) ]
holders_subset <- holders_subset[ !is.na(equity_share)]
setnames(holders_subset, "equity_share", "share")
# Network size
#uniqueN(holders_subset$company_number)
#uniqueN(holders_subset$participant_id)
# Add company and participant names and countries of origin
holders_subset[, company_name := uk_basic_companies_data[ match(holders_subset$company_number, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
#holders_subset[, company_country := uk_basic_companies_data[ match(holders_subset$company_number, uk_basic_companies_data$CompanyNumber)]$CountryOfOrigin ]
holders_subset[, participant_name := uk_basic_companies_data[ match(holders_subset$participant_id, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
#holders_subset[, participant_country := uk_basic_companies_data[ match(holders_subset$participant_id, uk_basic_companies_data$CompanyNumber)]$CountryOfOrigin ]
# Proper name handling for individuals: first name-surname
holders_subset[grepl("$", participant_id, fixed = T) & kind != "individual", participant_name := as.data.table(stri_split_fixed(participant_id, "$", n = 3, simplify = T)[,1])]
holders_subset[kind == "individual", participant_name := paste0(stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 1], " ", stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 3])]
#holders_subset[, company_country := gsub("United Kingdom", "UK", company_country) ]
#holders_subset[, participant_country := gsub("United Kingdom", "UK", participant_country) ]
# Are the names unique?
#uniqueN(holders_subset$company_name)
#uniqueN(holders_subset$participant_name)
# Add the graph types of organizations
holders_subset[, company_type := graph_membership[ match(holders_subset$company_number, graph_membership$participant_id) ]$type ]
holders_subset[, participant_type := graph_membership[ match(holders_subset$participant_id, graph_membership$participant_id) ]$type ]
holders_subset[, share_range := NA_character_ ]
holders_subset[ share == 87.5, share_range := "75-100%"]
holders_subset[ share == 62.5, share_range := "50-75%"]
holders_subset[ share == 37.5, share_range := "25-50%"]
## Total number of holdings
holdings_count <- holders_subset[, list(count_holdings_total = .N), by = "participant_name"]
## Total number of holders
holders_count <- holders_subset[, list(count_holders_total = .N), by = "company_name"]
# To igraph object
## Create an object with vertex metadata
vertex_metadata <- unique(rbind(unique(data.table( name = holders_subset$company_name, type = holders_subset$company_type)),
data.table( name = holders_subset$participant_name, type = holders_subset$participant_type)
), by = "name")
vertex_metadata <- merge(vertex_metadata, holdings_count, by.x = "name", by.y = "participant_name", all.x = T, all.y = F, sort = F)
vertex_metadata <- merge(vertex_metadata, holders_count, by.x = "name", by.y = "company_name", all.x = T, all.y = F, sort = F)
vertex_metadata[is.na(count_holdings_total), count_holdings_total := 0]
vertex_metadata[is.na(count_holders_total), count_holders_total := 0]
# NA to mock 0
vertex_metadata[count_holdings_total == 0, count_holdings_total := 1]
# Remove intermediaries with no holders
vertex_metadata <- vertex_metadata[ !(count_holders_total == 0 & type == "I") ]
## Convert to igraph
holders_example_graph <- graph_from_data_frame(holders_subset[participant_name %in% vertex_metadata$name & company_name %in% vertex_metadata$name, c("participant_name", "company_name", "share", "share_range")], directed = T, vertices = vertex_metadata)
# Line-wrap the name
V(holders_example_graph)$name_label <- str_wrap(V(holders_example_graph)$name, 15)
# Graph to ggnetwork
holders_example_net <- ggnetwork(holders_example_graph, layout = with_kk(), arrow.gap = 0.01)
# Empty name for certain types if network is large
if(nrow(holders_example_net) > 150) {
holders_example_net[!(holders_example_net$type %in% c("SH", "C") | holders_example_net$name %in% c("SPECSAVERS UK HOLDINGS LIMITED", "HANSON PACKED PRODUCTS LIMITED", "HOUSERATE LIMITED", "HANSON LIMITED", "HEIDELBERGCEMENT UK HOLDING LIMITED") ), "name_label"] <- ""
}
# Plot the graph
holders_example_plot <- ggplot(holders_example_net, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "#ebecf0", arrow = arrow(angle = 45, length = unit(5, "pt"), type = "closed"), show.legend = F) +
geom_nodes(aes(color = type, size = log(count_holdings_total)), show.legend = T) +
{ if( org_interest %in% c("03782947", "PHILIP$AULD$MACTAGGART$1956$2", "BAJLINDER$KAUR$BOPARAN$1968$1")) geom_edgetext(aes(label = share_range), color = "grey25", size = 1.5) } +
geom_nodetext_repel(aes(label = name_label, color = type), fontface = "bold", size = 2, show.legend = F, point.padding = 5, max.overlaps = 300) +
#{ if(nrow(holders_example_net) > 150) geom_nodetext_repel(aes(color = type), label = "", fontface = "bold", size = 2, show.legend = F, point.padding = 5) } +
#{ if(nrow(holders_example_net) <= 150) geom_nodetext_repel(aes(label = name, color = type), fontface = "bold", size = 2, show.legend = F, point.padding = 5) } +
scale_size(guide = "none") +
scale_color_manual(name = "node type", labels = c("super-holder", "super-target", "intermediary", "core"), values = c("SH" = "#66c2a5", "ST" = "#8da0cb", "I" = "grey", "C" = "#fc8d62")) +
guides(color = guide_legend(override.aes = list(size = 5))) +
theme_blank() + theme(text = element_text(family = "Open Sans"), legend.position = "bottom", plot.margin = unit(c(0, 0, 0, 0), "cm"))
# Export to an image
graph_file_name <- gsub("[[:punct:]]", "", stri_split_fixed(org_interest, "$", simplify = T)[1,1])
if( org_interest %in% c("SEAN$MICHAEL$DINNEN$1968$3", "SPECSAVERS OPTICAL SUPERSTORES LTD$UNITED KINGDOM$1721624") ) {
ggsave(paste0("output/alphaicon_paper/network_examples/uk_example ", graph_file_name, ".pdf"), height = 20, width = 20, scale = 1.2, device = cairo_pdf)
} else {
ggsave(paste0("output/alphaicon_paper/network_examples/uk_example ", graph_file_name, ".pdf"), height = 10, width = 10, device = cairo_pdf)
}
message(graph_file_name)
}
########
# Plot the graph with the core components only
core <- psc[ participant_id %in% unique(graph_membership[type == "C"]$participant_id) & company_number %in% unique(graph_membership[type == "C"]$participant_id) ]
core <- core[ !is.na(equity_share)]
setnames(core, "equity_share", "share")
# Add company and participant names and countries of origin
core[, company_name := uk_basic_companies_data[ match(core$company_number, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
core[, participant_name := uk_basic_companies_data[ match(core$participant_id, uk_basic_companies_data$CompanyNumber)]$CompanyName ]
# Proper name handling for individuals: first name-surname
core[grepl("$", participant_id, fixed = T) & kind != "individual", participant_name := as.data.table(stri_split_fixed(participant_id, "$", n = 3, simplify = T)[,1])]
core[kind == "individual", participant_name := paste0(stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 1], " ", stri_split_fixed(participant_id, "$", n = 4, simplify = T)[, 3])]
# Add the graph types of organizations
core[, company_type := graph_membership[ match(core$company_number, graph_membership$participant_id) ]$type ]
core[, participant_type := graph_membership[ match(core$participant_id, graph_membership$participant_id) ]$type ]
core[, share_range := NA_character_ ]
core[ share == 87.5, share_range := "75-100%"]
core[ share == 62.5, share_range := "50-75%"]
core[ share == 37.5, share_range := "25-50%"]
# Add the total number of companies where core participants
## Total number of holdings
core_holdings <- holders[participant_id %in% graph_membership[type == "C"]$participant_id, list(count_holdings_total = .N), by = "participant_id"]
## Holdings in core
core_holdings <- merge(core_holdings, holders[participant_id %in% graph_membership[type == "C"]$participant_id & company_number %in% graph_membership[type == "C"]$participant_id, list(count_holdings_core = .N), by = "participant_id"], by = "participant_id", all = T)
# To igraph object
## Create an object with vertex metadata
vertex_metadata <- unique(rbind(unique(data.table(name = core$company_name, type = core$company_type, id = core$company_number)),
data.table( name = core$participant_name, type = core$participant_type, id = core$participant_id)
), by = "name")
# Add the counts of holdings
vertex_metadata <- merge(vertex_metadata, core_holdings, by.x = "id", by.y = "participant_id", all = T)
vertex_metadata[, id := NULL]
# Keep only non-trivial components
vertex_metadata <- vertex_metadata[ count_holdings_core > 2 ]
## Convert to igraph
core_graph <- graph_from_data_frame(core[ participant_name %in% vertex_metadata$name | company_name %in% vertex_metadata$name, c("participant_name", "company_name", "share", "share_range")], directed = T, vertices = vertex_metadata)
# Line-wrap the name
V(core_graph)$name_label <- str_wrap(V(core_graph)$name, 15)
# Graph to ggnetwork
core_net <- ggnetwork(core_graph, layout = with_fr(), arrow.gap = 0.01)
# Plot the graph
core_plot <- ggplot(core_net, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "grey", arrow = arrow(angle = 45, length = unit(3, "pt"), type = "closed"), show.legend = F) +
geom_nodes(aes(color = type, size = count_holdings_total), show.legend = T) +
geom_edgetext(aes(label = share_range), color = "grey25", size = 1) +
geom_nodetext_repel(aes(label = name_label, color = type), fontface = "bold", size = 2, show.legend = F, point.padding = 5) +
scale_size(guide = "none") +
scale_color_manual(name = "node type", labels = c("super-holder", "super-target", "intermediary", "core"), values = c("SH" = "#66c2a5", "ST" = "#8da0cb", "I" = "grey", "C" = "#fc8d62")) +
guides(color = guide_legend(override.aes = list(size = 5))) +
theme_blank() + theme(text = element_text(family = "Open Sans"), legend.position = "none", plot.margin = unit(c(0, 0, 0, 0), "cm"))
ggsave(paste0("output/network_examples/core_plot.pdf"), height = 10, width = 10, scale = 1.1, device = cairo_pdf)