All things feature engineering
%load_ext autoreload
%autoreload 2
import torch
import numpy as np

Read the example df

read_football_csv[source]

read_football_csv(path)

df = read_football_csv(Path('../data/football_data_uk/raw/germany/D1_1415.csv'))
df.head()
Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR ... BbAv<2.5 BbAH BbAHh BbMxAHH BbAvAHH BbMxAHA BbAvAHA PSCH PSCD PSCA
0 D1 2014-08-22 Bayern Munich Wolfsburg 2 1 H 1 0 H ... 3.00 22 -2.00 2.35 2.24 1.73 1.68 1.29 6.67 10.58
1 D1 2014-08-23 Dortmund Leverkusen 0 2 A 0 1 A ... 2.39 26 -1.00 2.08 2.02 1.87 1.84 1.75 4.18 4.77
2 D1 2014-08-23 Ein Frankfurt Freiburg 1 0 H 1 0 H ... 2.01 22 -0.50 2.05 2.01 1.90 1.86 2.01 3.74 3.92
3 D1 2014-08-23 FC Koln Hamburg 0 0 D 0 0 D ... 2.00 21 0.00 1.50 1.47 2.92 2.70 2.06 3.62 3.86
4 D1 2014-08-23 Hannover Schalke 04 2 1 H 0 0 D ... 2.15 22 0.25 1.89 1.84 2.08 2.04 3.10 3.60 2.37

5 rows × 67 columns

Extract teams

extract_teams[source]

extract_teams(df, home_team='HomeTeam', away_team='AwayTeam')

teams = extract_teams(df)
teams
['Bayern Munich',
 'Dortmund',
 'Ein Frankfurt',
 'FC Koln',
 'Hannover',
 'Hertha',
 'Hoffenheim',
 'Paderborn',
 "M'gladbach",
 'Augsburg',
 'Hamburg',
 'Leverkusen',
 'Schalke 04',
 'Stuttgart',
 'Werder Bremen',
 'Wolfsburg',
 'Freiburg',
 'Mainz']

Game day

There should be n_teams/2 games per gameday.

add_gamedays[source]

add_gamedays(df, home_team='HomeTeam', away_team='AwayTeam')

df = add_gamedays(df)

Points

  • 3 points for the winning team
  • 0 points for the losing team
  • 1 point for each team when they draw
  • Extract all playing teams (sanity check)
  • current_points: start with 0 points for each team
  • home/away_points: empty list
  • Go through the matches in order of date (=time of kickoff)
    • note current_points for each team
    • update current points according to match outcome
  • add columns for home/away points

Points accumulation

add_points[source]

add_points(df, home_team='HomeTeam', away_team='AwayTeam', date='Date')

df = add_points(df)
df.tail()
Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR ... BbMxAHH BbAvAHH BbMxAHA BbAvAHA PSCH PSCD PSCA gameday home_points away_points
301 D1 2015-05-23 Hamburg Schalke 04 2 0 H 0 0 D ... 2.11 2.04 1.85 1.80 2.09 3.89 3.54 33 32 48
303 D1 2015-05-23 Hoffenheim Hertha 2 1 H 1 0 H ... 2.00 1.95 1.98 1.90 2.14 3.57 3.68 33 41 35
302 D1 2015-05-23 Hannover Freiburg 2 1 H 1 0 H ... 1.97 1.92 1.98 1.92 2.02 3.54 4.11 33 34 34
304 D1 2015-05-23 M'gladbach Augsburg 1 3 A 1 0 H ... 1.94 1.89 2.02 1.96 1.47 5.00 7.05 33 66 46
305 D1 2015-05-23 Paderborn Stuttgart 1 2 A 1 1 D ... 2.09 2.02 1.88 1.83 3.56 4.13 2.01 33 31 33

5 rows × 70 columns

Positions

add_positions[source]

add_positions(df)

df = add_positions(df)
df.head()
Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR ... BbMxAHA BbAvAHA PSCH PSCD PSCA gameday home_points away_points home_position away_position
0 D1 2014-08-22 Bayern Munich Wolfsburg 2 1 H 1 0 H ... 1.73 1.68 1.29 6.67 10.58 0 0 0 1 1
1 D1 2014-08-23 Dortmund Leverkusen 0 2 A 0 1 A ... 1.87 1.84 1.75 4.18 4.77 0 0 0 1 1
2 D1 2014-08-23 Ein Frankfurt Freiburg 1 0 H 1 0 H ... 1.90 1.86 2.01 3.74 3.92 0 0 0 1 1
3 D1 2014-08-23 FC Koln Hamburg 0 0 D 0 0 D ... 2.92 2.70 2.06 3.62 3.86 0 0 0 1 1
4 D1 2014-08-23 Hannover Schalke 04 2 1 H 0 0 D ... 2.08 2.04 3.10 3.60 2.37 0 0 0 1 1

5 rows × 72 columns

Simple diffs

add_simple_diffs[source]

add_simple_diffs(df)

df = add_simple_diffs(df)
df.tail()
Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR ... PSCH PSCD PSCA gameday home_points away_points home_position away_position points_diff position_diff
301 D1 2015-05-23 Hamburg Schalke 04 2 0 H 0 0 D ... 2.09 3.89 3.54 33 32 48 14 5 -16 9
303 D1 2015-05-23 Hoffenheim Hertha 2 1 H 1 0 H ... 2.14 3.57 3.68 33 41 35 8 11 6 -3
302 D1 2015-05-23 Hannover Freiburg 2 1 H 1 0 H ... 2.02 3.54 4.11 33 34 34 12 12 0 0
304 D1 2015-05-23 M'gladbach Augsburg 1 3 A 1 0 H ... 1.47 5.00 7.05 33 66 46 3 6 20 -3
305 D1 2015-05-23 Paderborn Stuttgart 1 2 A 1 1 D ... 3.56 4.13 2.01 33 31 33 15 13 -2 2

5 rows × 74 columns

Result

df.head()
Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR ... PSCH PSCD PSCA gameday home_points away_points home_position away_position points_diff position_diff
0 D1 2014-08-22 Bayern Munich Wolfsburg 2 1 H 1 0 H ... 1.29 6.67 10.58 0 0 0 1 1 0 0
1 D1 2014-08-23 Dortmund Leverkusen 0 2 A 0 1 A ... 1.75 4.18 4.77 0 0 0 1 1 0 0
2 D1 2014-08-23 Ein Frankfurt Freiburg 1 0 H 1 0 H ... 2.01 3.74 3.92 0 0 0 1 1 0 0
3 D1 2014-08-23 FC Koln Hamburg 0 0 D 0 0 D ... 2.06 3.62 3.86 0 0 0 1 1 0 0
4 D1 2014-08-23 Hannover Schalke 04 2 1 H 0 0 D ... 3.10 3.60 2.37 0 0 0 1 1 0 0

5 rows × 74 columns

results_from_goals[source]

results_from_goals(df, home_col, away_col)

Compares goals in home_col to goals in away_col and returns the results, encoded as: -1 -> home win, 0 -> draw, 1 -> away win.

result_df = pd.DataFrame(columns=['home', 'away'])
result_df['home'] = [1,2,3,0]
result_df['away'] = [2,1,3,0]


result_df['result'] = results_from_goals(result_df, 'home', 'away')
result_df.head()
home away result
0 1 2 1
1 2 1 -1
2 3 3 0
3 0 0 0
assert (result_df.result.values == [1,-1,0,0]).all()

Profit odds

odds_df = result_df.copy()
odds_df[['odds_home', 'odds_draw', 'odds_away']] = df.loc[:3, ['B365H', 'B365D', 'B365A']]

odds_df.head()
home away result odds_home odds_draw odds_away
0 1 2 1 1.25 6.00 10.0
1 2 1 -1 1.57 4.33 5.0
2 3 3 0 2.05 3.40 3.6
3 0 0 0 2.00 3.50 3.6

create_profit_df[source]

create_profit_df(df, odds_home, odds_draw, odds_away, home_profit='y_home', draw_profit='y_draw', away_profit='y_away', df_result_col='result')

add_profit_cols[source]

add_profit_cols(df, odds_home, odds_draw, odds_away, home_profit='y_home', draw_profit='y_draw', away_profit='y_away', df_result_col='result')

create_profit_df(odds_df, 'odds_home', 'odds_draw', 'odds_away')
y_home y_draw y_away
0 -1.00 -1.0 9.0
1 0.57 -1.0 -1.0
2 -1.00 2.4 -1.0
3 -1.00 2.5 -1.0
add_profit_cols(odds_df, 'odds_home', 'odds_draw', 'odds_away')
home away result odds_home odds_draw odds_away y_home y_draw y_away
0 1 2 1 1.25 6.00 10.0 -1.00 -1.0 9.0
1 2 1 -1 1.57 4.33 5.0 0.57 -1.0 -1.0
2 3 3 0 2.05 3.40 3.6 -1.00 2.4 -1.0
3 0 0 0 2.00 3.50 3.6 -1.00 2.5 -1.0

Normalizer

df = pd.DataFrame([[1,2,3], [4,5,6]], columns=['a', 'b', 'c'])
df
a b c
0 1 2 3
1 4 5 6

normalize_by_args[source]

normalize_by_args(x, mean, std)

normalize_col[source]

normalize_col(col)

class ColumnNormalizer[source]

ColumnNormalizer(columns, names=None)

normalize_by_args(df.a, 5, 2.)
0   -2.0
1   -0.5
Name: a, dtype: float64
normalize_col(df.a)
0   -0.707107
1    0.707107
Name: a, dtype: float64
ColumnNormalizer(df.loc[:,['a', 'b']]), ColumnNormalizer.from_df(df, ['a','b'])
(Mean: 3.0 | Std: 1.5811388300841898 | Names: unknown,
 Mean: 3.0 | Std: 1.5811388300841898 | Names: ['a', 'b'])
norm = ColumnNormalizer.from_df(df, ['a','b'])

df.loc[:, norm.names] = norm(df.loc[:, norm.names])
df
a b c
0 -1.264911 -0.632456 3
1 0.632456 1.264911 6
assert df.loc[:, ['a', 'b']].values.mean()==0., 'ColumnNormalizer should produce 0 mean.'
assert df.loc[:, ['a', 'b']].values.std()==1., 'ColumnNormalizer should produce 1. std.'

Profit loss

Computes the outcome of a betting allocation. The profit is negated to form the loss, so that minimizing the loss maximizes the profit.

odds_loss[source]

odds_loss(actual, target)

Compute the mean negative profit

odds_profit[source]

odds_profit(actual, target)

Compute the total profit

odds_df = pd.DataFrame([[3.2, 3.25, 2.29], [1.91, 3.3, 3.93]], columns=['home', 'draw', 'away'])
odds_df.head()
home draw away
0 3.20 3.25 2.29
1 1.91 3.30 3.93
allocations = torch.tensor([[.5, .2, .3], [.8, .05, .15]])

odds_loss(allocations, odds_df.values), odds_profit(allocations, odds_df.values)
(tensor(-2.8457, dtype=torch.float64), tensor(5.6913, dtype=torch.float64))

Export