8  Making predictions with new games

And we’ve arrived at tournament time. Now we’re going to take all this modeling and apply it to new games. To do this, it takes a few extra steps that only make sense after you’ve done them.

Those steps are:

  1. We’re going to build our models the way we have been doing so all along. We need a one game lag to mimic not knowing the stats before the game. But now that we’ve worked with them this much, we should have an idea of what model works best for us, so we’re only going to do that one model. We don’t need multiple. We’ll stop at the fit.
  2. After we have a fit, we have to redo our feature engineering but this time without all the lags. We’re now at a point where the data we have is what we need. No more shifting it back one game because we’re officially in the future.
  3. The, we have to go through a process of joining to get the two teams we need for a particular matchup.
  4. Once we have our games, we can apply the fit to them and get a prediction.

It seems like a lot, and in terms of lines of code, it is. But it’s really a lot of copy paste work. Let’s get started.

library(tidyverse)
library(tidymodels)
library(zoo)
library(bonsai)
library(hoopR)

set.seed(1234)

We’ll continue to use what we’ve done for feature engineering – a rolling window of points per possession for team and opponent. You should be quite familiar with this by now.

teamgames <- load_mbb_team_box(seasons = 2015:2024)

teamstats <- teamgames |> 
  mutate(
    possessions = field_goals_attempted - offensive_rebounds + turnovers + (.475 * free_throws_attempted),
    ppp = team_score/possessions,
    oppp = opponent_team_score/possessions
  )

rollingteamstats <- teamstats |> 
  group_by(team_short_display_name, season) |>
  arrange(game_date) |>
  mutate(
    team_rolling_ppp = rollmean(lag(ppp, n=1), k=5, align="right", fill=NA),
    team_rolling_oppp = rollmean(lag(oppp, n=1), k=5, align="right", fill=NA)
    ) |> 
  ungroup()

team_side <- rollingteamstats |>
  select(
    game_id,
    team_id, 
    team_short_display_name, 
    opponent_team_id, 
    game_date, 
    season, 
    team_score, 
    team_rolling_ppp,
    team_rolling_oppp
    )

opponent_side <- team_side |>
  select(-opponent_team_id) |> 
  rename(
    opponent_team_id = team_id,
    opponent_short_display_name = team_short_display_name,
    opponent_score = team_score,
    opponent_rolling_ppp = team_rolling_ppp,
    opponent_rolling_oppp = team_rolling_oppp
  ) |>
  mutate(opponent_id = as.numeric(opponent_team_id)
)

games <- team_side |> inner_join(opponent_side)

games <- games |> mutate(
  team_result = as.factor(case_when(
    team_score > opponent_score ~ "W",
    opponent_score > team_score ~ "L"
)))

games$team_result <- relevel(games$team_result, ref="W")

modelgames <- games |> 
  select(
    game_id, 
    game_date, 
    team_short_display_name, 
    opponent_short_display_name, 
    season, 
    team_rolling_ppp, 
    team_rolling_oppp,
    opponent_rolling_ppp,
    opponent_rolling_oppp,
    team_result
    ) |>
  na.omit()

We’re going to go through the steps of modeling again, starting with splitting our modelgames data and going all the way down to the fit. For your notebooks, you only need one model and one fit, so it’s time to clean it all up.

game_split <- initial_split(modelgames, prop = .8)
game_train <- training(game_split)
game_test <- testing(game_split)

game_recipe <- 
  recipe(team_result ~ ., data = game_train) |> 
  update_role(game_id, game_date, team_short_display_name, opponent_short_display_name, season, new_role = "ID") |>
  step_normalize(all_predictors())

svm_mod <- 
  svm_poly() |>
  set_engine("kernlab") |>
  set_mode("classification") 

svm_workflow <- 
  workflow() |> 
  add_model(svm_mod) |> 
  add_recipe(game_recipe)

svm_fit <- 
  svm_workflow |> 
  fit(data = game_train)
 Setting default kernel parameters  

8.1 Redoing the feature engineering

Now that we have our fit based on known data, we need to redo our feature engineering so that we now have up to the moment data. You get that by just simply removing the lags. Make sure you remove the lag( parts AND the n=1) parts both.

rollingteamstats <- teamstats |> 
  group_by(team_short_display_name, season) |>
  arrange(game_date) |>
  mutate(
    team_rolling_ppp = rollmean(ppp, k=5, align="right", fill=NA),
    team_rolling_oppp = rollmean(oppp, k=5, align="right", fill=NA),
    ) |> 
  ungroup() 

team_side <- rollingteamstats |>
  select(
    game_id,
    team_id, 
    team_short_display_name, 
    opponent_team_id, 
    game_date, 
    season, 
    team_score, 
    team_rolling_ppp,
    team_rolling_oppp
    )

opponent_side <- team_side |>
  select(-opponent_team_id) |> 
  rename(
    opponent_team_id = team_id,
    opponent_short_display_name = team_short_display_name,
    opponent_score = team_score,
    opponent_rolling_ppp = team_rolling_ppp,
    opponent_rolling_oppp = team_rolling_oppp
  ) |>
  mutate(opponent_id = as.numeric(opponent_team_id)
)

games <- team_side |> inner_join(opponent_side)
Joining with `by = join_by(game_id, opponent_team_id, game_date, season)`
games <- games |> mutate(
  team_result = as.factor(case_when(
    team_score > opponent_score ~ "W",
    opponent_score > team_score ~ "L"
)))

games$team_result <- relevel(games$team_result, ref="W")

modelgames <- games |> 
  select(
    game_id, 
    game_date, 
    team_short_display_name, 
    opponent_short_display_name, 
    season, 
    team_rolling_ppp, 
    team_rolling_oppp,
    opponent_rolling_ppp,
    opponent_rolling_oppp,
    team_result
    ) |>
  na.omit()

Now we need to get the first games. Let’s start with the first round of last seaons’s Big Ten Tournament – we’ll pretend it’s happening today and the results are the same. To do this, we’re first going to make a tibble with the teams in the team_short_display_name and opponnent_short_display_name.

round1games <- tibble(
  team_short_display_name="Ohio State",
  opponent_short_display_name="Wisconsin"
) |> add_row(
  team_short_display_name="Minnesota",
  opponent_short_display_name="Nebraska"
)

Now with that, we need to get all the team data for our game and join it to our round 1 games. This will get the latest information, drop the team_result and all the opponent information and then add it to our round1games dataframe. Then it will do it again, but this time for the opponent side of the game.

round1games <- modelgames |> 
  group_by(team_short_display_name) |> 
  filter(game_date == max(game_date) & season == 2024) |> 
  ungroup() |> 
  select(-team_result, -starts_with("opponent")) |> 
  right_join(round1games)
Joining with `by = join_by(team_short_display_name)`
round1games <- modelgames |> 
  group_by(opponent_short_display_name) |> 
  filter(game_date == max(game_date) & season == 2024) |> 
  ungroup() |> 
  select(-team_result, -starts_with("team"), -game_id, -game_date, -season) |> 
  right_join(round1games) 
Joining with `by = join_by(opponent_short_display_name)`

Now, just like before, we apply our fits. The select at the end is just to move the right stuff up to the front of the table and make it easy for us to see what we need to see.

And who does our model think will win the first round?

round1 <- svm_fit |> predict(new_data = round1games) |>
  bind_cols(round1games) |> select(.pred_class, team_short_display_name, opponent_short_display_name, everything())

round1 <- svm_fit |> predict(new_data = round1games, type="prob") |>
  bind_cols(round1) |> select(.pred_class, .pred_W, .pred_L, team_short_display_name, opponent_short_display_name, everything())

round1
# A tibble: 2 × 12
  .pred_class .pred_W .pred_L team_short_display_name opponent_short_display_n…¹
  <fct>         <dbl>   <dbl> <chr>                   <chr>                     
1 L             0.321   0.679 Ohio State              Wisconsin                 
2 W             0.516   0.484 Minnesota               Nebraska                  
# ℹ abbreviated name: ¹​opponent_short_display_name
# ℹ 7 more variables: opponent_rolling_ppp <dbl>, opponent_rolling_oppp <dbl>,
#   game_id <int>, game_date <date>, season <int>, team_rolling_ppp <dbl>,
#   team_rolling_oppp <dbl>