Predicting the outcome of cage fights using R · 2019-05-10 · Chin: the more a fighter gets...

Predicting the outcome of cage fights using R

Warning: violence

Background

Rules of MMA

http://www.youtube.com/watch?v=l8TmBfS6ghE&t=288

http://www.youtube.com/watch?v=iY-oIgLru7k&t=64

http://www.youtube.com/watch?v=stc_Ax-exPw&t=395

library(tidyverse)

library(rvest)

# Function to scrape fights info for each fighter

#' Scrape the fight-record table from a fighter's wiki page
#'
#' @param x URL of the fighter's wiki page; passed to `read_html()`.
#' @param y Fighter name; stored in the `fighter` column of the result.
#' @return The matching wiki tables bound into one data frame, with a
#'   `fighter` column added and `Round` converted to character.
scrape_fighters <- function(x, y) {
  # Pause before the request so as not to bombard the website. The pause must
  # not be the last expression: Sys.sleep() returns NULL, which would then be
  # the function's return value instead of the scraped table.
  Sys.sleep(1)

  x %>%
    read_html() %>%
    html_nodes("table.wikitable") %>% # Extract each table on the wiki page
    map(html_table, fill = TRUE) %>% # Convert each table to df
    # Keep only the table containing fights, defined by having 10 columns
    # and the first column named 'Res.'
    keep(~ length(names(.x)) == 10) %>%
    keep(~ colnames(.x)[1] == "Res.") %>%
    bind_rows() %>%
    mutate(
      fighter = y, # Add a new column with the fighter
      Round = as.character(Round)
    )
}

library(purrr)

library(dplyr)

library(rvest)

# NOTE(review): the file name below ends in a bare "." - it looks truncated;
# confirm the real extension before running.
list_of_fighters <- read_csv("./Data/fighterlinksvect.")

# For each fighter wiki page, pull out the table with their mma record.
# safely() turns every call into a list with `result` and `error` components,
# so fighter_records is a list of such lists.
fighter_records <- map2(
  list_of_fighters$wiki_link,
  list_of_fighters$name,
  safely(scrape_fighters)
)

# There will be errors which need to be looked at.
# map("error") extracts the component from every element; pluck("error") on
# the outer (unnamed) list would look for a top-level element and return NULL.
fighter_records %>%
  map("error") %>%
  compact()

# Save the result as a data frame
fighter_records_complete <- fighter_records %>%
  map("result") %>%
  bind_rows()

Hypothesis

Chin: The more a fighter gets knocked out, the easier they are to knock out in future

“That guy’s chin is shot”

Mark Hunt in 2005….

http://www.youtube.com/watch?v=MjzAc9vNPf4&t=540

Mark Hunt in 2013 (after having been KO’d a few times)

http://www.youtube.com/watch?v=NsdPdV27EyA&t=8

# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()

# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()

# shorthand for binding cols
map_dfr(mtcars, mean)

# parallelised mapping
library(furrr)
# multisession replaces the deprecated multiprocess strategy
plan(multisession)

future_map_dfr(mtcars, mean)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

#' Count a fighter's KO losses before a fight and look up that fight's result
#'
#' Reads the global `fights_full` data frame (columns used: `fighter`, `date`,
#' `result`, `method`).
#'
#' @param date_input Date of the fight of interest.
#' @param fighter_input Fighter name; matched case-insensitively after
#'   trimming whitespace.
#' @return A one-row data frame with `n_past_kos` (TKO/KO losses strictly
#'   before `date_input`) and `fight_result` (result of the most recent fight
#'   on or before `date_input`, `NA` when there is none).
gen_past_kos <- function(date_input, fighter_input) {
  # Normalise the input the same way as the column, otherwise names passed
  # straight from fights_full$fighter never match the lower-cased column.
  fighter_key <- tolower(trimws(fighter_input))

  df <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    filter(fighter == fighter_key, date <= date_input) %>%
    arrange(desc(date))

  # Only fights before the one being described count as "past"; including the
  # current fight would leak its outcome into the predictor.
  is_earlier <- df$date < date_input

  data.frame(
    n_past_kos = sum(is_earlier & df$result == "Loss" & df$method == "TKO/KO"),
    # df is sorted newest first, so element 1 is the fight on date_input;
    # indexing yields NA (not a zero-length vector) when df is empty.
    fight_result = df$result[1]
  )
}

library(furrr)
# multisession replaces the deprecated multiprocess strategy
plan(multisession)

# Number of past KOs losses going into the fight
# (one safely()-wrapped result per row of fights_full)
past_kos <- future_map2(
  fights_full$date,
  fights_full$fighter,
  safely(gen_past_kos),
  .progress = TRUE
)

Past knockout losses Vs proportion of fights won

# past KO losses
# For every fight, count the fighter's TKO/KO losses *before* that fight and
# plot the share of fights won at each count.
fights_full %>%
  arrange(fighter, date) %>%
  group_by(fighter) %>%
  mutate(
    ko_loss = result == "Loss" & method == "TKO/KO",
    # cumsum() includes the current row; subtract it so the count describes
    # the fighter going into the fight, as the x-axis label states.
    past_ko_losses = cumsum(ko_loss) - ko_loss
  ) %>%
  ungroup() %>%
  group_by(past_ko_losses) %>%
  summarise(win_prop = sum(result == "Win") / n()) %>%
  ggplot(aes(x = past_ko_losses, y = win_prop)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Past knockout losses Vs Proportion of fights won") +
  xlab("Number of past KOs losses going into the fight") +
  ylab("Win Proportion") +
  xlim(0, 15)

Age: The older a fighter gets, the slower they are and the easier they are to knock out or submit

“That guy is past his prime”

http://www.youtube.com/watch?v=7aRMbtwFK-c&t=112

http://www.youtube.com/watch?v=SKkCB_lkrlg

# Share of fights won at each whole-year age, with a smoothed trend line.
# The denominator is every row in the age group, including rows whose
# win_loss is missing.
static_stats %>%
  mutate(fighter_age = round(fighter_age)) %>%
  group_by(fighter_age) %>%
  summarise(
    win_proportion = sum(win_loss == 1, na.rm = TRUE) / n()
  ) %>%
  ggplot(aes(x = fighter_age, y = win_proportion)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Age of MMA fighters vs. likelihood of winning") +
  xlim(17, 40)

Age of MMA fighters Vs likelihood of winning their fight

If we predicted just using age,

we would be accurate 56% of the time

Reach: The longer a fighter's arms, the better their chance of attacking from a safe distance

“He fights standing just outside the pocket”

Mark Hunt in 2005….

http://www.youtube.com/watch?v=IJGF6Wl4_w8&t=230

Framing the Problem

Regression problem?

P fighter 1 winning

= fighter 1 reach +

fighter 1 age +

fighter 1 past KO losses +

fighter 2 reach +

fighter 2 age +

fighter 2 past KO losses +

…...

Win streak

Home town / country advantage

Past submission wins

Past submission losses

Number of rounds in fight

Title defence

Style (wrestling, BJJ, boxing)

library(rsample)

library(ranger)

# 80/20 train/test split
x <- initial_split(model_data, prop = 0.8)
train <- training(x)
test <- testing(x)

# scaling attributes: centre and scale are taken from the training set only
train_scaled_attrs <- train %>%
  select(-win_loss) %>%
  keep(is.numeric) %>%
  scale()

# scale train and test set (after removing dep var and character vars).
# win_loss has to be dropped here as well: scale() requires the centre/scale
# vectors to have one entry per column, and the attributes above were
# computed without win_loss.
train_scaled <- train %>%
  select(-win_loss) %>%
  select_if(is.numeric) %>%
  scale(
    center = attr(train_scaled_attrs, "scaled:center"),
    scale = attr(train_scaled_attrs, "scaled:scale")
  )

test_scaled <- test %>%
  select(-win_loss) %>%
  select_if(is.numeric) %>%
  scale(
    center = attr(train_scaled_attrs, "scaled:center"),
    scale = attr(train_scaled_attrs, "scaled:scale")
  )

# the IDs for use afterwards - includes dep var and non numeric.
# Columns are chosen by type and by name. Comparing every column's values with
# win_loss yields NA for numeric columns that contain missing values, and
# would also keep any other column that happens to equal the target.
train_ids <- train[
  ,
  !vapply(train, is.numeric, logical(1)) | names(train) == "win_loss",
  drop = FALSE
]

test_ids <- test[
  ,
  !vapply(test, is.numeric, logical(1)) | names(test) == "win_loss",
  drop = FALSE
]

# variables to model with
# Training frame: scaled predictors that contain no missing values, followed
# by the id / target columns.
train_data <- bind_cols(
  select_if(data.frame(train_scaled), function(column) !anyNA(column)),
  train_ids
)

# Test frame: same construction, restricted to the training frame's columns.
test_data <- bind_cols(data.frame(test_scaled), test_ids) %>%
  select(one_of(names(train_data)))

# Random Forest model
# win_loss is converted to a factor so that the forest is a classifier and
# its predictions can be tabulated against the observed classes.
rf_mod <- ranger(
  win_loss ~ . - fighter - opponent - date,
  data = mutate(train_data, win_loss = as.factor(win_loss)),
  importance = "impurity"
)

# Predict with the model fitted above (the original referred to an undefined
# `rf_mod_bin`).
preds <- predict(rf_mod, test_data)

# Predictions go in `data`, observed outcomes in `reference`; both must be
# factors sharing the same levels.
# NOTE(review): assumes confusionMatrix() is caret's - confirm caret is attached.
confusionMatrix(
  data = preds$predictions,
  reference = factor(test_data$win_loss, levels = levels(preds$predictions))
)

Network problem?

# Time series?
library(tidyverse)
library(geomnet)

# Fighter-vs-opponent network: one node per fighter, one undirected edge per
# fighter/opponent pair in `df`, nodes coloured by weight class.
network_graph <- df %>%
  ggplot(aes(from_id = fighter, to_id = opponent)) +
  geom_net(
    aes(colour = weight_class),
    layout.alg = "fruchtermanreingold",
    size = 1,
    labelon = TRUE,
    vjust = -0.6,
    ecolour = "grey60",
    directed = FALSE,
    fontsize = 1,
    ealpha = 0.5
  ) +
  theme_net() +
  theme(legend.position = "bottom")

# Time series problem?
library(PlayerRatings)
library(lubridate)
library(tidyverse)

start_date <- as.Date("2000-01-01")

# Build the table passed to elo(): time period, fighter, opponent and a
# numeric result (1 = win, 0 = loss, 0.5 = anything else).
input <- fights_full %>%
  distinct() %>%
  filter(date > start_date) %>%
  select(date, result, fighter, opponent) %>%
  transmute(
    weeks_since_start = time_length(difftime(date, start_date), "weeks"),
    fighter,
    opponent,
    result = case_when(
      result == "Win" ~ 1,
      result == "Loss" ~ 0,
      TRUE ~ 0.5
    )
  )

elo(input, history = TRUE)

Time series problem?

P fighter winning

= f(Body strikes blocked in last n fights ,

Opponent body strikes attempted in last n,

Attempted takedowns in last n,

Opponent takedowns defended in last n,

Average fight time of past n fights,

Win streak,

Days since last KO loss,

……)

#' Build one row of pre-fight history features for a fighter
#'
#' Uses objects that are not defined inside this function and must exist in
#' the calling environment: `fights_full`, `fight_statistics`, the
#' `lookback_prop_win_loss`, `lookback_n_x_in_last` and
#' `lookback_strike_history` settings, and the helpers `prop_x_in_last()`,
#' `n_x_in_last()` and `mean_x_in_last()`.
#'
#' @param date_input Date of the fight; only fights before it are used for
#'   the history features.
#' @param fighter_input Fighter name, compared with the lower-cased, trimmed
#'   `fighter` column of `fights_full`.
#' @param fighter_or_opponent Prefix pasted onto every output column name.
#' @return A data frame of features with prefixed column names, or `NULL`
#'   when the fighter has no fights before `date_input`.
generate_past_statistics <- function(date_input,
                                     fighter_input,
                                     fighter_or_opponent = "fighter") {
  # filter the fights dataframe to show only the fighter in question and data
  # up to their last fight
  df_ <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    dplyr::filter(fighter == fighter_input, date < date_input) %>%
    arrange(desc(date))

  # check if the fighter or opponent names are in the df
  if (nrow(df_) == 0) {
    return(NULL)
  }

  # number of wins and losses
  # NOTE(review): takes the first matching row in fights_full's existing
  # order - confirm that order is the intended one.
  wins_losses <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    dplyr::filter(fighter == fighter_input, date <= date_input) %>%
    dplyr::filter(row_number() == 1) %>%
    select(wins, losses)

  input_prop_win_loss <- list(
    lookback = lookback_prop_win_loss, # c(2, 4, 6, 8, 16, 32),
    metric = c("Loss", "Win")
  )

  # map function over each element - should I NA values that are higher than
  # the number of fights?
  prop_win_loss <- input_prop_win_loss %>%
    cross_df() %>%
    pmap_dfc(prop_x_in_last)

  # create list of all possible inputs
  input_win_loss_history <- list(
    lookback = lookback_n_x_in_last,
    metric = c("Loss", "Win"),
    method = c("TKO/KO", "Decision")
  )

  # map function over each element
  win_loss_history <- input_win_loss_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # strike history
  input_strike_history <- list(
    lookback = lookback_strike_history,
    method = c("TKO/KO"),
    metric = c("Loss", "Win") # sig strikes, td, etc
  )

  # map function over each element
  # NOTE(review): strike_history is computed but not included in the output.
  strike_history <- input_strike_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # calculate the average strikes of different types in the last n matches -
  # n can be tuned
  input_fighter_round_vars <- select(fight_statistics, kd:takedown_accuracy) %>%
    names()

  mean_strikes_dealt <- fight_statistics %>%
    filter(fighter == fighter_input, date < date_input) %>%
    arrange(desc(date)) %>%
    filter(row_number() <= lookback_strike_history) %>%
    summarise_at(input_fighter_round_vars, mean, na.rm = TRUE) %>%
    set_names(paste0("mean_", colnames(.), "_of_last_", lookback_strike_history))

  # calculate the strikes of different types absorbed through the career
  # (30 fights)
  ## select(df, opponent_kd:opponent_takedown_accuracy) %>% names()
  input_opponent_round_vars <- df_ %>%
    select(contains("opponent_")) %>%
    select(contains("_lnd")) %>%
    names()

  new_names <- str_replace_all(
    input_opponent_round_vars,
    c("opponent_" = "", "_lnd" = "")
  )

  strikes_absorbed <- mean_x_in_last(30, input_opponent_round_vars) %>%
    set_names(paste0(new_names, "_absorbed"))

  ### CURRENT WIN STREAK
  # rle lists the values of the vector in order, with the length of each run;
  # df_ is sorted newest first, so the first run is the current streak
  ws_ls <- rle(as.character(df_$result))

  loss_streak <- 0
  win_streak <- 0
  other_streak <- 0

  if (!is.na(ws_ls$values[1])) {
    if (ws_ls$values[1] == "Loss") {
      loss_streak <- ws_ls$lengths[1]
    } else if (ws_ls$values[1] == "Win") {
      win_streak <- ws_ls$lengths[1]
    } else {
      # Any other result (neither win nor loss) is its own streak; the
      # original recorded it as a loss streak and left other_streak at 0.
      other_streak <- ws_ls$lengths[1]
    }
  }

  # combine win or loss or other into data frame
  win_loss_streak <- data.frame(
    loss_streak = loss_streak,
    win_streak = win_streak,
    other_streak = other_streak
  )

  ### DAYS SINCE LAST KO LOSS
  # Filter to KO losses first and only then take the first (most recent) row.
  # Testing row_number() == 1 in the same filter() call only matches when the
  # fighter's very last fight was a KO loss.
  last_ko_loss <- df_ %>%
    filter(result == "Loss", method == "TKO/KO") %>%
    filter(dplyr::row_number() == 1)

  # if the fighter has never been KO'd, put the days in as 1000
  if (nrow(last_ko_loss) == 0) {
    days_since_last_ko <- data.frame(days_since_last_ko_loss = 1000)
  } else {
    days_since_last_ko <- last_ko_loss %>%
      transmute(
        days_since_last_ko_loss =
          as.Date(format(as.Date(date_input, origin = "1970-01-01"))) - date
      ) %>%
      mutate(
        days_since_last_ko_loss = as.numeric(days_since_last_ko_loss),
        # cap at 1000, the same value used for "never knocked out"
        days_since_last_ko_loss = ifelse(
          days_since_last_ko_loss > 1000,
          1000,
          days_since_last_ko_loss
        )
      )
  }

  # LAST FIGHT RESULT
  last_fight_result <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(last_fight_result = ifelse(result == "Win", 1, 0))

  # AVERAGE FIGHT TIME
  average_fight_time <- df_ %>%
    summarise(
      average_fight_time = mean(total_fight_time_seconds),
      total_fight_time = sum(total_fight_time_seconds),
      total_fights = n()
    )

  # AGE
  # NOTE(review): this stores the fight date, not an age, and Age is not
  # included in the output below - confirm what was intended.
  Age <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(fighter_age = date_input)

  # TOTAL FIGHTS
  total_fights <- df_ %>%
    summarise(total_fights = n())

  # Performance Bonuses
  n_perf_bonuses <- df_ %>%
    summarise(n_perf_bonuses = sum(performance_bonus))

  # the variables will sometimes be blank if there is no fight history -
  # replace with NAs if so
  first_fight_NA <- function(variable) {
    if (nrow(variable) == 0) {
      variable[1, ] <- NA
    }
    variable
  }

  # apply clean up function and bind everything together
  map_dfc(
    list(
      data.frame(fighter = fighter_input),
      data.frame(date = date_input),
      wins_losses,
      total_fights,
      prop_win_loss,
      win_loss_history,
      win_loss_streak,
      days_since_last_ko,
      last_fight_result,
      n_perf_bonuses,
      average_fight_time,
      mean_strikes_dealt,
      strikes_absorbed
    ),
    .f = first_fight_NA
  ) %>%
    set_names(paste(fighter_or_opponent, colnames(.), sep = "_"))
}

library(furrr)
# multisession replaces the deprecated multiprocess strategy
plan(multisession)

# One row of history features per fight, computed in parallel - first from
# the fighter's point of view, then from the opponent's.
# NOTE(review): generate_past_statistics() compares names against a
# lower-cased, trimmed column - confirm fights_to_lookback is normalised too.
fighter_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$fighter,
    fighter_or_opponent = "fighter"
  ),
  generate_past_statistics,
  .progress = TRUE
)

opponent_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$opponent,
    fighter_or_opponent = "opponent"
  ),
  generate_past_statistics,
  .progress = TRUE
)

# long winded for loop to extract column means
# The list is preallocated and filled with [[<-; seq_len() keeps the loop
# from running when there are no columns (1:0 would iterate twice).
out <- vector("list", ncol(mtcars))

for (i in seq_len(ncol(mtcars))) {
  out[[i]] <- mean(mtcars[, i])
}

names(out) <- colnames(mtcars)

# bind the named list of column means into a one-row tibble
bind_rows(out)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()

# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()

# shorthand for binding cols
map_dfr(mtcars, mean)

# parallelised mapping
library(furrr)
# multisession replaces the deprecated multiprocess strategy
plan(multisession)

future_map_dfr(mtcars, mean)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

# Time series?
library(keras)

# NOTE(review): presumably a one-off setup step - confirm it does not need to
# run on every execution of the script.
install_keras()

# Feed-forward binary classifier: two hidden ReLU layers (256 and 128 units),
# each followed by 40% dropout, and a single sigmoid output unit.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 256,
    activation = "relu",
    input_shape = ncol(train_x_matrix)
  ) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 1, activation = "sigmoid")

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = "adam",
  metrics = list("accuracy")
)

# Fit on the predictor matrix against win_loss, holding out 30% of the
# training rows for validation.
model %>% fit(
  train_x_matrix,
  select(train_data, win_loss) %>% pull(),
  epochs = 15,
  batch_size = 3,
  validation_split = 0.3
)

Using the model to predict future outcomes

(and make money??)

Predicting the outcome of cage fights using R · 2019-05-10 · Chin: the more a fighter gets...

Documents