library(tidyverse)
library(tidymodels)
library(zoo)
library(bonsai)
library(hoopR)
set.seed(1234)
8 Making predictions with new games
And we’ve arrived at tournament time. Now we’re going to take all this modeling and apply it to new games. To do this, it takes a few extra steps that only make sense after you’ve done them.
Those steps are:
- We’re going to build our models the way we have been doing so all along. We need a one game lag to mimic not knowing the stats before the game. But now that we’ve worked with them this much, we should have an idea of what model works best for us, so we’re only going to do that one model. We don’t need multiple. We’ll stop at the fit.
- After we have a fit, we have to redo our feature engineering but this time without all the lags. We’re now at a point where the data we have is what we need. No more shifting it back one game because we’re officially in the future.
- The, we have to go through a process of joining to get the two teams we need for a particular matchup.
- Once we have our games, we can apply the fit to them and get a prediction.
It seems like a lot, and in terms of lines of code, it is. But it’s really a lot of copy paste work. Let’s get started.
We’ll continue to use what we’ve done for feature engineering – a rolling window of points per possession for team and opponent. You should be quite familiar with this by now.
<- load_mbb_team_box(seasons = 2015:2024)
teamgames
<- teamgames |>
teamstats mutate(
possessions = field_goals_attempted - offensive_rebounds + turnovers + (.475 * free_throws_attempted),
ppp = team_score/possessions,
oppp = opponent_team_score/possessions
)
<- teamstats |>
rollingteamstats group_by(team_short_display_name, season) |>
arrange(game_date) |>
mutate(
team_rolling_ppp = rollmean(lag(ppp, n=1), k=5, align="right", fill=NA),
team_rolling_oppp = rollmean(lag(oppp, n=1), k=5, align="right", fill=NA)
|>
) ungroup()
<- rollingteamstats |>
team_side select(
game_id,
team_id,
team_short_display_name,
opponent_team_id,
game_date,
season,
team_score,
team_rolling_ppp,
team_rolling_oppp
)
<- team_side |>
opponent_side select(-opponent_team_id) |>
rename(
opponent_team_id = team_id,
opponent_short_display_name = team_short_display_name,
opponent_score = team_score,
opponent_rolling_ppp = team_rolling_ppp,
opponent_rolling_oppp = team_rolling_oppp
|>
) mutate(opponent_id = as.numeric(opponent_team_id)
)
<- team_side |> inner_join(opponent_side)
games
<- games |> mutate(
games team_result = as.factor(case_when(
> opponent_score ~ "W",
team_score > team_score ~ "L"
opponent_score
)))
$team_result <- relevel(games$team_result, ref="W")
games
<- games |>
modelgames select(
game_id,
game_date,
team_short_display_name,
opponent_short_display_name,
season,
team_rolling_ppp,
team_rolling_oppp,
opponent_rolling_ppp,
opponent_rolling_oppp,
team_result|>
) na.omit()
We’re going to go through the steps of modeling again, starting with splitting our modelgames
data and going all the way down to the fit. For your notebooks, you only need one model and one fit, so it’s time to clean it all up.
<- initial_split(modelgames, prop = .8)
game_split <- training(game_split)
game_train <- testing(game_split)
game_test
<-
game_recipe recipe(team_result ~ ., data = game_train) |>
update_role(game_id, game_date, team_short_display_name, opponent_short_display_name, season, new_role = "ID") |>
step_normalize(all_predictors())
<-
svm_mod svm_poly() |>
set_engine("kernlab") |>
set_mode("classification")
<-
svm_workflow workflow() |>
add_model(svm_mod) |>
add_recipe(game_recipe)
<-
svm_fit |>
svm_workflow fit(data = game_train)
Setting default kernel parameters
8.1 Redoing the feature engineering
Now that we have our fit based on known data, we need to redo our feature engineering so that we now have up to the moment data. You get that by just simply removing the lags. Make sure you remove the lag( parts AND the n=1) parts both.
<- teamstats |>
rollingteamstats group_by(team_short_display_name, season) |>
arrange(game_date) |>
mutate(
team_rolling_ppp = rollmean(ppp, k=5, align="right", fill=NA),
team_rolling_oppp = rollmean(oppp, k=5, align="right", fill=NA),
|>
) ungroup()
<- rollingteamstats |>
team_side select(
game_id,
team_id,
team_short_display_name,
opponent_team_id,
game_date,
season,
team_score,
team_rolling_ppp,
team_rolling_oppp
)
<- team_side |>
opponent_side select(-opponent_team_id) |>
rename(
opponent_team_id = team_id,
opponent_short_display_name = team_short_display_name,
opponent_score = team_score,
opponent_rolling_ppp = team_rolling_ppp,
opponent_rolling_oppp = team_rolling_oppp
|>
) mutate(opponent_id = as.numeric(opponent_team_id)
)
<- team_side |> inner_join(opponent_side) games
Joining with `by = join_by(game_id, opponent_team_id, game_date, season)`
<- games |> mutate(
games team_result = as.factor(case_when(
> opponent_score ~ "W",
team_score > team_score ~ "L"
opponent_score
)))
$team_result <- relevel(games$team_result, ref="W")
games
<- games |>
modelgames select(
game_id,
game_date,
team_short_display_name,
opponent_short_display_name,
season,
team_rolling_ppp,
team_rolling_oppp,
opponent_rolling_ppp,
opponent_rolling_oppp,
team_result|>
) na.omit()
Now we need to get the first games. Let’s start with the first round of last seaons’s Big Ten Tournament – we’ll pretend it’s happening today and the results are the same. To do this, we’re first going to make a tibble with the teams in the team_short_display_name
and opponnent_short_display_name
.
<- tibble(
round1games team_short_display_name="Ohio State",
opponent_short_display_name="Wisconsin"
|> add_row(
) team_short_display_name="Minnesota",
opponent_short_display_name="Nebraska"
)
Now with that, we need to get all the team data for our game and join it to our round 1 games. This will get the latest information, drop the team_result
and all the opponent information and then add it to our round1games dataframe. Then it will do it again, but this time for the opponent side of the game.
<- modelgames |>
round1games group_by(team_short_display_name) |>
filter(game_date == max(game_date) & season == 2024) |>
ungroup() |>
select(-team_result, -starts_with("opponent")) |>
right_join(round1games)
Joining with `by = join_by(team_short_display_name)`
<- modelgames |>
round1games group_by(opponent_short_display_name) |>
filter(game_date == max(game_date) & season == 2024) |>
ungroup() |>
select(-team_result, -starts_with("team"), -game_id, -game_date, -season) |>
right_join(round1games)
Joining with `by = join_by(opponent_short_display_name)`
Now, just like before, we apply our fits. The select at the end is just to move the right stuff up to the front of the table and make it easy for us to see what we need to see.
And who does our model think will win the first round?
<- svm_fit |> predict(new_data = round1games) |>
round1 bind_cols(round1games) |> select(.pred_class, team_short_display_name, opponent_short_display_name, everything())
<- svm_fit |> predict(new_data = round1games, type="prob") |>
round1 bind_cols(round1) |> select(.pred_class, .pred_W, .pred_L, team_short_display_name, opponent_short_display_name, everything())
round1
# A tibble: 2 × 12
.pred_class .pred_W .pred_L team_short_display_name opponent_short_display_n…¹
<fct> <dbl> <dbl> <chr> <chr>
1 L 0.321 0.679 Ohio State Wisconsin
2 W 0.516 0.484 Minnesota Nebraska
# ℹ abbreviated name: ¹opponent_short_display_name
# ℹ 7 more variables: opponent_rolling_ppp <dbl>, opponent_rolling_oppp <dbl>,
# game_id <int>, game_date <date>, season <int>, team_rolling_ppp <dbl>,
# team_rolling_oppp <dbl>