# load packages
library(here)
library(rvest)
library(tidyverse)
This project was inspired by this tweet, which lists some name chains of NHL players. I wanted to create a program that would generate name chains; however, I wanted to test it on a new data set: NBA players. This project is very much still in progress: I am learning new ways of improving it every day!
The repository for this project contains:
Data is up to date as of 2019/07/26.
All data comes from Basketball Reference.
First, scrape all of Basketball Reference’s player codes from each “NBA & ABA Players with Last Names Starting with [Letter]” page.
Don’t scrape the names directly from these pages; they often list a player’s nickname, not legal first name.
# base of every basketball-reference player url
main.url <- "https://www.basketball-reference.com/players/"

# every lowercase letter except 'x' - there are no NBA players with last names
# starting with 'x'
letter.list <- setdiff(letters, "x")

# visit each letter's index page and pull the player codes out of the
# players table (one character vector of codes per letter)
codes_by_letter <- lapply(letter.list, function(initial) {
  index_page <- read_html(paste0(main.url, initial, "/"))
  header_cells <- html_nodes(index_page, "table#players > tbody > tr > th")
  html_attr(header_cells, "data-append-csv")
})

# flatten into a single list holding one player code per element
all_codes <- as.list(unlist(codes_by_letter, use.names = FALSE))

# save locally
saveRDS(all_codes, here("save", "NBA_ABA_codes.rds"))
Then, use the player codes to scrape the name of each player.
# all_codes <- readRDS(here("save", "NBA_ABA_codes.rds"))
# main.url <- "https://www.basketball-reference.com/players/"

# when testing on the 'a' names, three (3) different selectors were needed to
# find the full name; they are tried in this order and the first one that
# matches anything wins
name_selectors <- c(
  "#meta > div > p:nth-child(2) > strong > strong",
  "#meta > div:nth-child(2) > p:nth-child(3) > strong > strong",
  "#meta > div > p:nth-child(3) > strong > strong"
)

# one slot per player code, filled with whatever the player's page yields
scraped_names <- vector("list", length(all_codes))

for (k in seq_along(all_codes)) {
  player_code <- all_codes[[k]]

  # player pages live under the first character of their code
  player_url <- paste0(main.url, substr(player_code, 1, 1), "/",
                       player_code, ".html")
  player_page <- read_html(player_url)

  # get full name
  full_name <- character(0)
  for (selector in name_selectors) {
    full_name <- html_text(html_nodes(player_page, selector))
    if (length(full_name) > 0) {
      break
    }
  }

  scraped_names[[k]] <- full_name
}

# flatten into a single list holding one name per element; pages where no
# selector matched contribute nothing
all_names <- as.list(unlist(scraped_names, use.names = FALSE))

# save locally
saveRDS(all_names, here("save", "NBA_ABA_full_names.rds"))
Now, clean the names: remove suffixes, middle names, and duplicates.
# all_names <- readRDS(here("save", "NBA_ABA_full_names.rds"))

# Generational suffixes to strip from the scraped names.
# - the dot is escaped (and optional) so it only ever matches a literal "."
#   rather than "any character"
# - the lookahead requires the suffix to be a whole word, so a name part that
#   merely starts with "II" or "IV" is left alone
suffix_pattern <- " (Jr\\.?|Sr\\.?|IV|III|II)(?=\\s|$)"

# remove suffixes and split each name into its space-separated parts
name_parts <- all_names %>%
  unlist(use.names = FALSE) %>%
  as.character() %>%
  str_remove_all(suffix_pattern) %>%
  str_split(" ")

# first name = first part; last name = final part, or NA when the name has
# only one part. Working from the parts directly means this no longer assumes
# that the longest name has exactly five parts.
final_names <- tibble(
  first = vapply(name_parts, function(parts) parts[1], character(1)),
  last = vapply(
    name_parts,
    function(parts) {
      if (length(parts) >= 2) parts[length(parts)] else NA_character_
    },
    character(1)
  )
) %>%
  # fix certain names - manual inspection
  # (mostly multi-word surnames that splitting on spaces cut apart)
  mutate(first = case_when(last == "Watanabe" & first == "Watanabe" ~ "Yuta",
                           last == "Navarro" & first == "Juan" ~ "Juan Carlos",
                           TRUE ~ first),
         last = case_when(last == "Salle" & first == "George" ~ "Bon Salle",
                          last == "Colo" & first == "Nando" ~ "De Colo",
                          last == "Negro" & first == "Vincent" ~ "Del Negro",
                          last == "Arsdale" & first == "Richard" ~ "Van Arsdale",
                          last == "Arsdale" & first == "Thomas" ~ "Van Arsdale",
                          last == "Kolff" & first == "Willem" ~ "van Breda Kolff",
                          last == "Kolff" & first == "Jan" ~ "van Breda Kolff",
                          last == "Exel" & first == "Nickey" ~ "Van Exel",
                          last == "Horn" & first == "Keith" ~ "Van Horn",
                          last == "Lier" & first == "Norman" ~ "Van Lier",
                          last == "Zant" & first == "Dennis" ~ "Van Zant",
                          last == "Velden" & first == "Logan" ~ "Vander Velden",
                          last == "Peace" & first == "Metta" ~ "World Peace",
                          TRUE ~ last)) %>%
  # keep only first and last names
  select(first, last) %>%
  # remove duplicates
  distinct()

# save locally
saveRDS(final_names, here("save", "NBA_ABA_final_names.rds"))
First, create the function needed to produce name chains.
This function creates a set of name chains of a user-specified length.
Inputs:
prev_set: (tbl) the previous set of name chains (contains one fewer name)
names: (tbl) the list of names to search through
num: (int) the number of first-last name pairs (i.e. the number of people)
Returns:
A tibble of name chains
#' Extend every name chain in `prev_set` by one more person.
#'
#' A chain can be extended by a person whose first name equals the value in
#' column `num` of the chain (the name the chain currently ends on). That
#' person's last name is appended as a new final column.
#'
#' @param prev_set (tbl) the previous set of name chains; only its first `num`
#'   columns are used.
#' @param names (tbl) the people to search through: column 1 is compared
#'   against the chain ends, column 2 is the value appended.
#' @param num (int) the number of columns of `prev_set` that make up a chain.
#' @return A tibble with columns `col1` ... `col{num + 1}`, one row per
#'   (chain, matching person) pair, ordered by chain row and then by row of
#'   `names`. An empty `tibble()` when nothing matches or an input has no rows.
create_chains <- function(prev_set, names, num) {
  # nothing to extend, or nobody to extend with
  if (nrow(prev_set) == 0 || nrow(names) == 0) {
    return(tibble())
  }

  chain_ends <- prev_set[[num]]
  candidate_firsts <- names[[1]]

  # for each chain, the rows of `names` whose first name continues it
  # (which() drops NA comparisons, so missing names never match)
  hits <- lapply(
    chain_ends,
    function(end_name) which(candidate_firsts == end_name)
  )
  hit_counts <- lengths(hits)
  if (sum(hit_counts) == 0) {
    return(tibble())
  }

  # repeat each chain once per match, then attach the matching last names;
  # building the result in one go avoids an rbind() for every match
  chain_rows <- rep(seq_along(chain_ends), times = hit_counts)
  candidate_rows <- unlist(hits, use.names = FALSE)

  this_set <- as_tibble(prev_set[chain_rows, seq_len(num), drop = FALSE])
  colnames(this_set) <- paste0("col", seq_len(num))
  this_set[[paste0("col", num + 1)]] <- names[[2]][candidate_rows]

  this_set
}
Now, build the sets of name chains.
# final_names <- readRDS(here("save", "NBA_ABA_final_names.rds"))

# reasonable starting points: the people whose last name is someone else's
# first name
chains_00 <- final_names %>% filter(last %in% first)

# size, in people, of the longest set of chains to build
max_people <- 15
n_steps <- max_people - 1

step_sets <- vector("list", n_steps)
chain_sets <- vector("list", n_steps)

prev_chains <- chains_00
prev_lasts <- final_names$last

for (k in seq_len(n_steps)) {
  # shorten the search slightly: only keep the people whose first name is a
  # last name from the previous step (for k = 1, anyone's last name)
  step_sets[[k]] <- final_names %>% filter(first %in% prev_lasts)

  # chains of k + 1 people
  chain_sets[[k]] <- create_chains(prev_chains, step_sets[[k]], k + 1)

  prev_chains <- chain_sets[[k]]
  prev_lasts <- step_sets[[k]]$last
}

# expose the results under their individual names (step_01 ... step_14 and
# chains_01 ... chains_14) because later code refers to them that way
names(step_sets) <- sprintf("step_%02d", seq_len(n_steps))
names(chain_sets) <- sprintf("chains_%02d", seq_len(n_steps))
invisible(list2env(c(step_sets, chain_sets), envir = environment()))
There are two (2) name chains with fifteen people:
## Tellis Frank Brian Howard Carl Henry James Allen Terry Thomas Brandon Paul George Wilson Chandler Hutchinson
## Tellis Frank Brian Howard Carl Henry James Allen Terry Thomas Brandon Paul George Wilson Chandler Parsons
The final product: bind all the sets of name chains together!
# the full list of name chains: stack every set, from chains of two people up
# to chains of fifteen, into one tibble (the starting set chains_00 is not
# included)
all_name_chains <- bind_rows(
  chains_01, chains_02, chains_03, chains_04, chains_05, chains_06, chains_07,
  chains_08, chains_09, chains_10, chains_11, chains_12, chains_13, chains_14
)

# save locally
saveRDS(all_name_chains, here("save", "NBA_ABA_name_chains.rds"))