predicting the outcome of cage fights using r · 2019-05-10 · chin the more a fighter gets...
TRANSCRIPT
Predicting the outcome of cage fights using R
Warning: violence
Background
Rules of MMA
Data
library(tidyverse)
library(rvest)
# Scrape the fight-record table from a single fighter's wiki page.
#
# Arguments:
#   x  URL (or path) of the fighter's page; passed straight to read_html().
#   y  Fighter name; written into the `fighter` column of every row.
#
# Returns a data frame with one row per fight found on the page. Only tables
# that have exactly 10 columns and whose first column is named "Res." are
# kept. `Round` is coerced to character so that the per-fighter results can
# later be row-bound without type clashes.
#
# Any error (unreachable page, no matching table, ...) is propagated; callers
# are expected to wrap this function, e.g. with purrr::safely().
scrape_fighters <- function(x, y) {
  fights <- x %>%
    read_html() %>%
    html_nodes("table.wikitable") %>% # Extract each table on the wiki page
    map(html_table, fill = TRUE) %>% # Convert each table to df
    keep(~ length(names(.x)) == 10) %>% # Fight tables have 10 columns ...
    keep(~ colnames(.x)[1] == "Res.") %>% # ... and a first column named 'Res.'
    bind_rows() %>%
    mutate(
      fighter = y, # Add a new column with the fighter
      Round = as.character(Round)
    )

  Sys.sleep(1) # So as not to bombard the website

  # Return the scraped table explicitly: the pause above must not be the
  # last expression, otherwise the function returns NULL.
  fights
}
library(purrr)
library(dplyr)
library(rvest)
# Table of fighters to scrape; the code below reads its `wiki_link` and
# `name` columns.
# NOTE(review): the file name ends in a bare "." - the extension looks
# truncated; confirm the real path.
list_of_fighters <- read_csv("./Data/fighterlinksvect.")

# For each fighter wiki page, pull out the table with their mma record.
# safely() turns every call into list(result = , error = ) so that one bad
# page does not abort the whole run.
fighter_records <- map2(
  list_of_fighters$wiki_link,
  list_of_fighters$name,
  safely(scrape_fighters)
)

# There will be errors which need to be looked at.
# The error sits inside each element of the list, so it has to be extracted
# element-wise with map(); pluck("error") on the outer list returns NULL.
fighter_records %>%
  map("error") %>%
  compact()

# Save the result as a data frame (failed pages contribute a NULL result,
# which bind_rows() ignores)
fighter_records_complete <- fighter_records %>%
  map("result") %>%
  bind_rows()
Hypothesis
Chin: The more a fighter gets knocked out, the easier they are to knock out in future
“That guy’s chin is shot”
Mark Hunt in 2005….
Mark Hunt in 2013 (after having been KO’d a few times)
# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()
# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()
# shorthand for binding cols
map_dfr(mtcars, mean)
# parallelised mapping
library(furrr)
# `multiprocess` is deprecated in the future package; `multisession` is the
# portable replacement (background R sessions on every OS).
plan(multisession)
future_map_dfr(mtcars, mean)
## # A tibble: 1 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20.1 6.19 231. 147. 3.60 3.22 17.8 0.438 0.406 3.69 2.81
# Count a fighter's knockout losses going into a given fight.
#
# Arguments:
#   date_input     Date of the fight of interest.
#   fighter_input  Fighter name; matched case-insensitively and ignoring
#                  surrounding whitespace.
#
# Reads the global `fights_full` (columns used: fighter, date, result,
# method).
#
# Returns a one-row data.frame:
#   n_past_kos    number of "Loss" by "TKO/KO" strictly before `date_input`
#   fight_result  result of the fighter's most recent fight on or before
#                 `date_input` (NA when the fighter has no such fight)
gen_past_kos <- function(date_input, fighter_input) {
  # The fighter column is normalised below, so the lookup key has to be
  # normalised the same way or mixed-case names never match.
  fighter_key <- tolower(trimws(fighter_input))

  df <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    filter(
      fighter == fighter_key,
      date <= date_input
    ) %>%
    arrange(desc(date))

  # Only fights before this one count as "past"; including the fight itself
  # would leak its outcome into the predictor.
  is_earlier <- df$date < date_input
  is_ko_loss <- df$result == "Loss" & df$method == "TKO/KO"

  data.frame(
    n_past_kos = sum(is_earlier & is_ko_loss, na.rm = TRUE),
    # With no matching fights a zero-length column would make data.frame()
    # fail; return NA instead.
    fight_result = if (nrow(df) == 0) NA else df$result[1]
  )
}
library(furrr)
# `multiprocess` is deprecated in the future package; `multisession` is the
# portable replacement.
plan(multisession)

# Run gen_past_kos() for every (date, fighter) pair in parallel. safely()
# wraps each call so a failing row yields an error entry instead of aborting.
past_kos <- future_map2(
  fights_full$date,
  fights_full$fighter,
  safely(gen_past_kos),
  .progress = TRUE
)
# Number of past KOs losses going into the fight
Past knockout losses Vs proportion of fights won
# past KO losses
# For every fight, count the fighter's KO losses *before* that fight, then
# plot the share of fights won at each count.
fights_full %>%
  arrange(fighter, date) %>%
  group_by(fighter) %>%
  mutate(
    ko_loss = result == "Loss" & method == "TKO/KO",
    # Running total minus the current fight: a KO loss in this very fight
    # must not count as a loss "going into" it.
    past_ko_losses = cumsum(ko_loss) - ko_loss
  ) %>%
  ungroup() %>%
  group_by(past_ko_losses) %>%
  summarise(win_prop = sum(result == "Win") / n()) %>%
  ggplot(aes(x = past_ko_losses, y = win_prop)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Past knockout losses Vs Proportion of fights won") +
  xlab("Number of past KOs losses going into the fight") +
  ylab("Win Proportion") +
  xlim(0, 15)
Age: The older a fighter gets, the slower they are and the easier they are to knock out or submit
“That guy is past his prime”
# Win proportion by (rounded) fighter age, with a smoothed trend line.
static_stats %>%
  mutate(fighter_age = round(fighter_age)) %>%
  group_by(fighter_age) %>%
  # Rows with a missing win_loss still count in the denominator n().
  summarise(win_proportion = sum(win_loss == 1, na.rm = TRUE) / n()) %>%
  ggplot(mapping = aes(x = fighter_age, y = win_proportion)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Age of MMA fighters vs. likelihood of winning") +
  xlim(17, 40)
Age of MMA fighters Vs likelihood of winning their fight
If we predicted just using age,
we would be accurate 56% of the time
Reach: The longer a fighter's arms, the more chance they have of attacking from a safe distance
“He fights standing just outside the pocket”
Mark Hunt in 2005….
Framing the Problem
Regression problem?
P fighter 1 winning
= fighter 1 reach +
fighter 1 age +
fighter 1 past KO losses +
fighter 2 reach +
fighter 2 age +
fighter 2 past KO losses +
…...
Win streak
Home town / country advantage
Past submission wins
Past submission losses
Number of rounds in fight
Title defence
Style (wrestling, BJJ, boxing)
library(rsample)
library(ranger)
# 80/20 train/test partition of the modelling table
x <- initial_split(model_data, prop = 0.8)
train <- training(x)
test <- testing(x)

# Fit the scaler on the numeric training predictors only; scale() stores the
# per-column centre and spread as attributes on its result.
train_scaled_attrs <- scale(keep(select(train, -win_loss), is.numeric))

# Apply the *training* centre/scale to both sets
# (dependent variable and non-numeric columns are dropped first)
train_scaled <- scale(
  select_if(select(train, -win_loss), is.numeric),
  center = attr(train_scaled_attrs, "scaled:center"),
  scale = attr(train_scaled_attrs, "scaled:scale")
)
test_scaled <- scale(
  select_if(select(test, -win_loss), is.numeric),
  center = attr(train_scaled_attrs, "scaled:center"),
  scale = attr(train_scaled_attrs, "scaled:scale")
)
# the IDs for use afterwards - includes dep var and non numeric
# select_if() keeps a column when the predicate is TRUE for it: either the
# column is not numeric, or every value equals win_loss (which is true for
# win_loss itself). These are the columns left out of the scaled matrices.
# `.` inside the predicate is the data frame on the left of the pipe; the
# function must stay inline for that binding to resolve.
# NOTE(review): a numeric column containing NA makes the predicate NA rather
# than FALSE - confirm select_if() tolerates that on this data.
train_ids <-
train %>% select_if(function(col)
! is.numeric(col) | all(col == .$win_loss))
# Same selection for the test set.
test_ids <-
test %>% select_if(function(col)
! is.numeric(col) | all(col == .$win_loss))
# variables to model with
# Training set: scaled predictors that contain no missing values, with the
# id / dependent-variable columns appended.
train_data <- bind_cols(
  select_if(
    data.frame(train_scaled),
    function(column) !anyNA(column)
  ),
  train_ids
)

# Test set: same construction, then restricted to the columns that survived
# in the training set.
test_data <- select(
  bind_cols(data.frame(test_scaled), test_ids),
  one_of(names(train_data))
)
# Random Forest model
# Predict win_loss from every column except the identifier columns.
# classification = TRUE forces a classification forest even when win_loss is
# stored as a number; otherwise ranger fits a regression forest and the
# confusion matrix below cannot be built.
rf_mod <-
  ranger(
    win_loss ~ . - fighter - opponent - date,
    data = train_data,
    importance = "impurity",
    classification = TRUE
  )

# Predict with the model fitted above (it was previously referenced under a
# name that is never defined).
preds <- predict(rf_mod, test_data)

# confusionMatrix() comes from caret, which must be attached.
# Predictions go in `data` and the observed outcome in `reference`, so that
# sensitivity/specificity are reported the right way round. Both are factors
# sharing the observed outcome's levels.
confusionMatrix(
  data = factor(
    preds$predictions,
    levels = levels(as.factor(test_data$win_loss))
  ),
  reference = as.factor(test_data$win_loss)
)
Network problem?
Time series?
library(tidyverse)
library(geomnet)
# Fighter-vs-opponent network: one edge per row of `df`, running from
# `fighter` to `opponent`, coloured by weight class.
network_graph <- ggplot(
  data = df,
  mapping = aes(from_id = fighter, to_id = opponent)
) +
  geom_net(
    mapping = aes(colour = weight_class),
    layout.alg = "fruchtermanreingold",
    directed = FALSE,
    # node appearance
    size = 1,
    # node labels
    labelon = TRUE,
    fontsize = 1,
    vjust = -0.6,
    # edge appearance
    ecolour = "grey60",
    ealpha = 0.5
  ) +
  theme_net() +
  theme(legend.position = "bottom")
Time series problem?
library(PlayerRatings)
library(lubridate)
library(tidyverse)
# Only fights after this date are rated
start_date <- as.Date("2000-01-01")

# Build the four-column table passed to elo():
# time period, player 1, player 2, result.
input <- fights_full %>%
  distinct() %>%
  filter(date > start_date) %>%
  select(date, result, fighter, opponent) %>%
  mutate(
    # Time is expressed as (fractional) weeks elapsed since start_date
    weeks_since_start = time_length(difftime(date, start_date), "weeks"),
    # Win = 1, Loss = 0, anything else scores as half a point
    result = case_when(
      result == "Win" ~ 1,
      result == "Loss" ~ 0,
      TRUE ~ 0.5
    )
  ) %>%
  select(weeks_since_start, fighter, opponent, result)

elo(input, history = TRUE)
Time series problem?
P fighter winning
= f(Body strikes blocked in last n fights ,
Opponent body strikes attempted in last n,
Attempted takedowns in last n,
Opponent takedowns defended in last n,
Average fight time of past n fights,
Win streak,
Days since last KO loss,
……)
# Build one row of pre-fight features for a fighter as of a given date.
#
# Arguments:
#   date_input           Date of the fight being predicted; only fights
#                        strictly before it feed the history features.
#   fighter_input        Fighter name. It is compared against the
#                        lower-cased, trimmed `fighter` column of
#                        `fights_full`.
#                        NOTE(review): the input itself is not normalised
#                        here - confirm callers pass lower-case names.
#   fighter_or_opponent  Prefix added to every output column name
#                        ("fighter" by default).
#
# Reads from the enclosing environment: `fights_full`, `fight_statistics`,
# the look-back settings `lookback_prop_win_loss`, `lookback_n_x_in_last`,
# `lookback_strike_history`, and the helpers `prop_x_in_last()`,
# `n_x_in_last()`, `mean_x_in_last()` (all defined outside this block).
#
# Returns NULL when the fighter has no fight before `date_input`; otherwise
# a one-row data frame whose column names are prefixed with
# `fighter_or_opponent`.
generate_past_statistics <- function(date_input,
                                     fighter_input,
                                     fighter_or_opponent = "fighter") {
  # All fights of this fighter, with the name column normalised once
  fighter_fights <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    dplyr::filter(fighter == fighter_input)

  # filter the fights dataframe to show only the fighter in question and
  # data up to their last fight (most recent first)
  df_ <- fighter_fights %>%
    dplyr::filter(date < date_input) %>%
    arrange(desc(date))

  # check if the fighter or opponent names are in the df
  if (nrow(df_) == 0) {
    return(NULL)
  }

  # number of wins and losses
  # NOTE(review): this takes the first matching row in `fights_full` order
  # and includes the fight on `date_input` itself - confirm that is intended.
  wins_losses <- fighter_fights %>%
    dplyr::filter(date <= date_input) %>%
    dplyr::filter(row_number() == 1) %>%
    select(wins, losses)

  input_prop_win_loss <- list(
    lookback = lookback_prop_win_loss, # c(2, 4, 6, 8, 16, 32),
    metric = c("Loss", "Win")
  )

  # map function over each element - should I NA values that are higher than
  # the number of fights?
  prop_win_loss <- input_prop_win_loss %>%
    cross_df() %>%
    pmap_dfc(prop_x_in_last)

  # create list of all possible inputs
  input_win_loss_history <- list(
    lookback = lookback_n_x_in_last,
    metric = c("Loss", "Win"),
    method = c("TKO/KO", "Decision")
  )

  # map function over each element
  win_loss_history <- input_win_loss_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # strike history
  input_strike_history <- list(
    lookback = lookback_strike_history,
    method = c("TKO/KO"),
    metric = c("Loss", "Win") # sig strikes, td, etc
  )

  # map function over each element
  # NOTE(review): `strike_history` is computed but not part of the result.
  strike_history <- input_strike_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # calculate the average strikes of different types in the last n matches -
  # n can be tuned
  input_fighter_round_vars <- select(
    fight_statistics,
    kd:takedown_accuracy
  ) %>%
    names()

  mean_strikes_dealt <- fight_statistics %>%
    filter(fighter == fighter_input, date < date_input) %>%
    arrange(desc(date)) %>%
    filter(row_number() <= lookback_strike_history) %>%
    summarise_at(input_fighter_round_vars, mean, na.rm = TRUE) %>%
    set_names(
      paste0("mean_", colnames(.), "_of_last_", lookback_strike_history)
    )

  # calculate the strikes of different types absorbed through the career
  # (30 fights)
  ## select(df, opponent_kd:opponent_takedown_accuracy) %>% names()
  input_opponent_round_vars <- df_ %>%
    select(contains("opponent_")) %>%
    select(contains("_lnd")) %>%
    names()

  new_names <- str_replace_all(
    input_opponent_round_vars,
    c("opponent_" = "", "_lnd" = "")
  )

  strikes_absorbed <- mean_x_in_last(30, input_opponent_round_vars) %>%
    set_names(paste0(new_names, "_absorbed"))

  ### CURRENT WIN STREAK
  # rle lists the values of a vector together with the length of each run;
  # df_ is newest-first, so the first run is the current streak
  ws_ls <- rle(as.character(df_$result))

  loss_streak <- 0
  win_streak <- 0
  other_streak <- 0
  if (!is.na(ws_ls$values[1])) {
    if (ws_ls$values[1] == "Loss") {
      loss_streak <- ws_ls$lengths[1]
    } else if (ws_ls$values[1] == "Win") {
      win_streak <- ws_ls$lengths[1]
    } else {
      # Any other result (draw, no contest, ...) is its own kind of streak;
      # it must not be booked as a losing streak.
      other_streak <- ws_ls$lengths[1]
    }
  }

  # combine win or loss or other into data frame
  win_loss_streak <- data.frame(
    loss_streak = loss_streak,
    win_streak = win_streak,
    other_streak = other_streak
  )

  ### DAYS SINCE LAST KO LOSS
  # First narrow down to KO losses, THEN take the most recent one. Testing
  # row_number() == 1 in the same filter() only matches when the fighter's
  # latest fight happened to be a KO loss.
  last_ko_loss <- df_ %>%
    filter(result == "Loss", method == "TKO/KO") %>%
    filter(dplyr::row_number() == 1)

  # if the fighter has never been KO'd, put the days in as 1000
  if (nrow(last_ko_loss) == 0) {
    days_since_last_ko <- data.frame(days_since_last_ko_loss = 1000)
  } else {
    days_since_last_ko <- last_ko_loss %>%
      transmute(
        days_since_last_ko_loss =
          as.Date(format(as.Date(date_input, origin = "1970-01-01"))) - date
      ) %>%
      mutate(
        days_since_last_ko_loss = as.numeric(days_since_last_ko_loss),
        # cap at 1000 days, the same value used for "never KO'd"
        days_since_last_ko_loss = ifelse(
          days_since_last_ko_loss > 1000,
          1000,
          days_since_last_ko_loss
        )
      )
  }

  # LAST FIGHT RESULT
  last_fight_result <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(last_fight_result = ifelse(result == "Win", 1, 0))

  # AVERAGE FIGHT TIME
  # NOTE(review): `total_fights` is also produced by the TOTAL FIGHTS step
  # below, so the bound result carries that name twice.
  average_fight_time <- df_ %>%
    summarise(
      average_fight_time = mean(total_fight_time_seconds),
      total_fight_time = sum(total_fight_time_seconds),
      total_fights = n()
    )

  # AGE
  # NOTE(review): this stores the fight date, not an age, and `Age` is not
  # included in the result below.
  Age <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(fighter_age = date_input)

  # TOTAL FIGHTS
  total_fights <- df_ %>%
    summarise(total_fights = n())

  # Performance Bonuses
  n_perf_bonuses <- df_ %>%
    summarise(n_perf_bonuses = sum(performance_bonus))

  # the variables will sometimes be blank if there is no fight history -
  # replace with NAs if so
  first_fight_NA <- function(variable) {
    if (nrow(variable) == 0) {
      variable[1, ] <- NA
    }
    variable
  }

  # apply clean up function and bind everything together
  map_dfc(
    list(
      data.frame(fighter = fighter_input),
      data.frame(date = date_input),
      wins_losses,
      total_fights,
      prop_win_loss,
      win_loss_history,
      win_loss_streak,
      days_since_last_ko,
      last_fight_result,
      n_perf_bonuses,
      average_fight_time,
      mean_strikes_dealt,
      strikes_absorbed
    ),
    .f = first_fight_NA
  ) %>%
    set_names(paste(fighter_or_opponent, colnames(.), sep = "_"))
}
library(furrr)
# `multiprocess` is deprecated in the future package; `multisession` is the
# portable replacement.
plan(multisession)

# One feature row per fight, computed in parallel from the fighter's side ...
fighter_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$fighter,
    fighter_or_opponent = "fighter"
  ),
  generate_past_statistics,
  .progress = TRUE
)

# ... and again from the opponent's side (columns prefixed "opponent_")
opponent_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$opponent,
    fighter_or_opponent = "opponent"
  ),
  generate_past_statistics,
  .progress = TRUE
)
# long winded for loop to extract column means
# Walk over the column names and store each column's mean under that name,
# producing a named list with one entry per mtcars column.
out <- list()
for (column_name in colnames(mtcars)) {
  out[[column_name]] <- mean(mtcars[[column_name]])
}
# Turn the named list of column means into a one-row tibble
bind_rows(out)
## # A tibble: 1 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20.1 6.19 231. 147. 3.60 3.22 17.8 0.438 0.406 3.69 2.81
# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()
# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()
# shorthand for binding cols
map_dfr(mtcars, mean)
# parallelised mapping
library(furrr)
# `multiprocess` is deprecated in the future package; `multisession` is the
# portable replacement (background R sessions on every OS).
plan(multisession)
future_map_dfr(mtcars, mean)
## # A tibble: 1 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20.1 6.19 231. 147. 3.60 3.22 17.8 0.438 0.406 3.69 2.81
Time series?
library(keras)
install_keras()
# Feed-forward binary classifier: two ReLU hidden layers (256 and 128 units),
# each followed by 40% dropout, and a single sigmoid output unit.
model <- keras_model_sequential()
model <- layer_dense(
  model,
  units = 256,
  activation = 'relu',
  input_shape = ncol(train_x_matrix)
)
model <- layer_dropout(model, rate = 0.4)
model <- layer_dense(model, units = 128, activation = 'relu')
model <- layer_dropout(model, rate = 0.4)
model <- layer_dense(model, units = 1, activation = 'sigmoid')

# Binary cross-entropy loss, Adam optimiser, accuracy reported during training
compile(
  model,
  loss = "binary_crossentropy",
  optimizer = "adam",
  metrics = list("accuracy")
)

# Train on the predictor matrix against the win_loss column of train_data,
# holding out 30% of the rows for validation
fit(
  model,
  train_x_matrix,
  pull(train_data, win_loss),
  epochs = 15,
  batch_size = 3,
  validation_split = 0.3
)
Using the model to predict future outcomes
(and make money??)