Introduction

# load packages
library(here)
library(rvest)
library(tidyverse)

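This project was inspired by this tweet, which lists some name chains of NHL players. A name chain is a sequence of players in which each player's last name is the next player's first name (for example, Brandon Paul, Paul George, George Wilson). I wanted to create a program that would generate name chains; however, I wanted to test it on a new data set: NBA players. This project is still very much in progress: I am learning new ways of improving it every day!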

The repository for this project contains:

Data is up to date as of 2019/07/26.

Data

All data comes from Basketball Reference.

First, scrape all of Basketball Reference’s player codes from each “NBA & ABA Players with Last Names Starting with [Letter]” page.

Don’t scrape the names directly from these pages; they often list a player’s nickname, not legal first name.

# base url shared by every basketball-reference player page
main.url <- "https://www.basketball-reference.com/players/"

# every letter of the alphabet except 'x' - there are no NBA players with last names starting with 'x'
letter.list <- setdiff(letters, "x")

# read one letter's index page and return its player codes as a list
codes_for_letter <- function(letter) {
  index_page <- read_html(paste0(main.url, letter, "/"))
  header_cells <- html_nodes(index_page, "table#players > tbody > tr > th")
  as.list(html_attr(header_cells, "data-append-csv"))
}

# one flat list of codes, concatenated in the order of letter.list
all_codes <- do.call(c, lapply(letter.list, codes_for_letter))

# save locally
saveRDS(all_codes, file = here("save", "NBA_ABA_codes.rds"))

Then, use the player codes to scrape the name of each player.

# all_codes <- readRDS(here("save", "NBA_ABA_codes.rds"))
# main.url <- "https://www.basketball-reference.com/players/"

# when testing on the 'a' names, I found that there were three (3) different selectors
# for the full name; they are tried in this order and the first one that matches wins
name_selectors <- c(
  "#meta > div > p:nth-child(2) > strong > strong",
  "#meta > div:nth-child(2) > p:nth-child(3) > strong > strong",
  "#meta > div > p:nth-child(3) > strong > strong"
)

# Scrape the full name shown on one player's page.
#
# Inputs:
#   code: (chr) a single player code taken from all_codes
# Returns:
#   A character vector holding the text of every node matched by the first
#   successful selector. Returns character(0), after a warning, when the code
#   is missing, the page cannot be read, or no selector matches, so that one
#   bad page is reported instead of silently dropped or aborting the whole run.
scrape_full_name <- function(code) {
  if (is.na(code)) {
    warning("Skipping a missing (NA) player code", call. = FALSE)
    return(character(0))
  }

  url <- paste0(main.url, substr(code, 1, 1), "/", code, ".html")
  page <- tryCatch(
    read_html(url),
    error = function(e) {
      warning("Could not read ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(character(0))
  }

  for (selector in name_selectors) {
    name <- page %>%
      html_nodes(selector) %>%
      html_text()
    if (length(name) > 0) {
      return(name)
    }
  }

  warning("No full name found for player code '", code, "'", call. = FALSE)
  character(0)
}

# flat list of names, one element per scraped name (same shape as before);
# built in one pass instead of growing the list on every iteration
all_names <- all_codes %>%
  lapply(scrape_full_name) %>%
  unlist(use.names = FALSE) %>%
  as.list()

# save locally
saveRDS(all_names, here("save", "NBA_ABA_full_names.rds"))

Now, clean the names: remove suffixes, middle names, and duplicates.

# all_names <- readRDS(here("save", "NBA_ABA_full_names.rds"))

# generational suffixes to strip. The dot is escaped (a bare "." matches any
# character) and the lookahead requires the suffix to be a whole token, i.e.
# followed by a space or the end of the name, so " IV" or " II" can no longer
# eat the start of a real name token.
suffix_pattern <- " (Jr\\.|Sr\\.|IV|III|II)(?= |$)"

# remove suffixes and split each name into its space-separated tokens
name_tokens <- all_names %>%
  unlist(use.names = FALSE) %>%
  as.character() %>%
  str_remove_all(suffix_pattern) %>%
  str_split(" ")

# first name = first token; last name = final token (NA when the name has only
# one token). Taking the tokens directly drops middle names and works for any
# number of tokens instead of assuming exactly five columns.
final_names <- tibble(
  first = vapply(name_tokens, function(tokens) tokens[1], character(1)),
  last = vapply(
    name_tokens,
    function(tokens) {
      if (length(tokens) > 1) tokens[length(tokens)] else NA_character_
    },
    character(1)
  )
) %>%
  # fix certain names - manual inspection
  mutate(first = case_when(last == "Watanabe" & first == "Watanabe" ~ "Yuta",
                           last == "Navarro" & first == "Juan" ~ "Juan Carlos",
                           TRUE ~ first),
         last = case_when(last == "Salle" & first == "George" ~ "Bon Salle",
                          last == "Colo" & first == "Nando" ~ "De Colo",
                          last == "Negro" & first == "Vincent" ~ "Del Negro",
                          last == "Arsdale" & first == "Richard" ~ "Van Arsdale",
                          last == "Arsdale" & first == "Thomas" ~ "Van Arsdale",
                          last == "Kolff" & first == "Willem" ~ "van Breda Kolff",
                          last == "Kolff" & first == "Jan" ~ "van Breda Kolff",
                          last == "Exel" & first == "Nickey" ~ "Van Exel",
                          last == "Horn" & first == "Keith" ~ "Van Horn",
                          last == "Lier" & first == "Norman" ~ "Van Lier",
                          last == "Zant" & first == "Dennis" ~ "Van Zant",
                          last == "Velden" & first == "Logan" ~ "Vander Velden",
                          last == "Peace" & first == "Metta" ~ "World Peace",
                          TRUE ~ last)) %>%
  # keep only first and last names
  select(first, last) %>%
  # remove duplicates
  distinct()

# save locally
saveRDS(final_names, here("save", "NBA_ABA_final_names.rds"))

Name Chains

First, create the function needed to produce name chains.

This function creates a set of name chains of a user-inputted length.

Inputs:
  prev_set: (tbl) the previous set of name chains (contains one fewer name)
  names: (tbl) the list of names to search through
  num: (int) the number of first-last name pairs (i.e. the number of people)
Returns:
  A tibble of name chains
#' Extend a set of name chains by one more person.
#'
#' A chain can be extended by a person whose first name equals the chain's
#' final name; the person's last name becomes the new final name.
#'
#' @param prev_set (tbl) the previous set of name chains; its first `num`
#'   columns hold the names of each chain and column `num` is the name the
#'   next person's first name must match.
#' @param names (tbl) the people to search through: column 1 is the first
#'   name, column 2 is the last name.
#' @param num (int) the number of people in each chain that is returned.
#' @return A tibble with columns `col1` ... `col{num + 1}`, one row per
#'   extended chain, ordered by row of `prev_set` and then by row of `names`.
#'   An empty `tibble()` when nothing can be extended.
create_chains <- function(prev_set, names, num) {
  # nothing to extend, or nobody to extend with (1:nrow() would iterate over
  # c(1, 0) here, so empty input needs an explicit guard)
  if (nrow(prev_set) == 0 || nrow(names) == 0) {
    return(tibble())
  }

  chain_ends <- prev_set[[num]]
  first_names <- names[[1]]
  last_names <- names[[2]]

  # for every chain, the rows of `names` whose first name equals the chain's
  # final name; which() drops NA comparisons, so a missing name never matches
  matches <- lapply(chain_ends, function(chain_end) {
    which(first_names == chain_end)
  })
  chain_rows <- rep(seq_along(chain_ends), lengths(matches))
  name_rows <- unlist(matches, use.names = FALSE)

  if (length(name_rows) == 0) {
    return(tibble())
  }

  col_names <- paste0("col", seq_len(num + 1))

  # repeat each chain once per matching person, then append the new last name
  this_set <- as_tibble(prev_set[chain_rows, seq_len(num)])
  colnames(this_set) <- col_names[seq_len(num)]
  this_set[[col_names[num + 1]]] <- last_names[name_rows]

  this_set
}

Now, build the name chains sets.

# final_names <- readRDS(here("save", "NBA_ABA_final_names.rds"))

# the longest chain (in people) to build; fifteen is the last length built for
# the data described in this document
max_people <- 15
n_steps <- max_people - 1

# reasonable starting points: the people whose last name is someone else's first name
chains_00 <- final_names %>% filter(last %in% first)

# step_sets[[k]]  - the people searched when adding person k + 1
# chain_sets[[k]] - every chain of k + 1 people
step_sets <- vector("list", n_steps)
chain_sets <- vector("list", n_steps)

prev_chains <- chains_00
# here, we shorten the process slightly by only searching through the people
# whose first name is someone else's last name
candidates <- final_names %>% filter(first %in% last)

for (k in seq_len(n_steps)) {
  step_sets[[k]] <- candidates

  # once nothing is left to extend (or nobody to extend with), every longer
  # set is empty; skip the search rather than call create_chains on empty input
  if (nrow(prev_chains) > 0 && nrow(candidates) > 0) {
    chain_sets[[k]] <- create_chains(prev_chains, candidates, k + 1)
  } else {
    chain_sets[[k]] <- tibble()
  }

  prev_chains <- chain_sets[[k]]
  # the next person's first name must be a last name reachable in this step
  candidates <- final_names %>% filter(first %in% step_sets[[k]]$last)
}

# expose step_01 ... step_14 and chains_01 ... chains_14 as variables, since
# later code refers to each set by name
names(step_sets) <- sprintf("step_%02d", seq_len(n_steps))
names(chain_sets) <- sprintf("chains_%02d", seq_len(n_steps))
invisible(list2env(c(step_sets, chain_sets), envir = environment()))

There are two (2) name chains with fifteen people:

## Tellis Frank Brian Howard Carl Henry James Allen Terry Thomas Brandon Paul George Wilson Chandler Hutchinson
## Tellis Frank Brian Howard Carl Henry James Allen Terry Thomas Brandon Paul George Wilson Chandler Parsons

The final product: bind all the sets of name chains together!

# stack every set of chains (two people through fifteen people) into one tibble
all_name_chains <- bind_rows(
  chains_01, chains_02, chains_03, chains_04, chains_05, chains_06, chains_07,
  chains_08, chains_09, chains_10, chains_11, chains_12, chains_13, chains_14
)

# save locally
saveRDS(all_name_chains, file = here("save", "NBA_ABA_name_chains.rds"))