Methodology
As what we’re interested in is a team’s overall performance and the data set that we’re looking at is structured in terms of each individual player’s performance, our first hurdle we had to jump over was getting the data set into a more workable form for what we’ve set out to achieve.
The first thing we did was delete the columns we deemed unnecessary going forward in our analysis; the obvious candidates to go were ‘Player’ and ‘Percent Played’ as they’d be useless once we combined everything to be in terms of teams rather than players. However, slightly less obvious variables that we removed were ‘FreesAgainst’ and ‘Uncontested Possessions.’ We did this because once one team’s statistics are put against another team’s statistics for comparison, one team’s ‘FreesFor’ would be exactly the same as the opposing team’s ‘FreesAgainst’, and vice versa (same thing for Contested vs Uncontested Possessions). So, to prevent future problems and clutter we deleted these four columns completely from our data set.
# Work on a copy of the imported data under a shorter name.
AFL <- AFLstats
# Keep columns 1 to 31 except positions 2, 24 and 26.
AFL <- AFL[setdiff(1:31, c(2, 24, 26))]
#colnames(AFL)
The next logical step in our methodology was to actually split our data set into the different teams total games as that’s what we’re most interested in, how well one team would perform against any other.
# Team names in order of first appearance in the data. The short variable
# names below rely on that order -- TODO confirm it matches the intended teams.
all_teams <- unique(as.character(AFL$Team))
# Player rows belonging to the i-th entry of all_teams.
team_rows <- function(i) {
  AFL[AFL$Team == all_teams[i], ]
}
ad <- team_rows(1)
bl <- team_rows(2)
clt <- team_rows(3)
clwd <- team_rows(4)
esdn <- team_rows(5)
fd <- team_rows(6)
glng <- team_rows(7)
gc <- team_rows(8)
gws <- team_rows(9)
ht <- team_rows(10)
mlb <- team_rows(11)
nmlb <- team_rows(12)
pad <- team_rows(13)
rmd <- team_rows(14)
sk <- team_rows(15)
syd <- team_rows(16)
wc <- team_rows(17)
wb <- team_rows(18)
After this had been done we still faced the problem that our data is in terms of rows of player statistics, and not rows of total game statistics. Through rigorous trial and error we created a nice, neat function which cleans all the data by wrangling it into the form we want it to be in (total statistics for any given game). It is also important to note that only five of the total games in the data set resulted in a draw (shown below), so in an effort to simplify the data we decided it would be best to just remove these cases (this occurs within the cleaning function).
# Number of drawn matches: player rows flagged "D", divided by the number of
# players in a team (22) times the number of teams in a match (2).
nrow(AFL[AFL$WinLoss == "D", ]) / (22 * 2)
[1] 5
This is the function which ‘cleans’ our data. It essentially takes the first nine columns and keeps one copy of the information there, as it’s unique to each match. The remaining columns are player statistics, so they are just summed together with all the other similar variables within any match to get the data into the form we want it to be in. As mentioned before, the games that were tied were removed from the data set in this step.
# Collapse one team's player-level rows into one row per match.
#
# Args:
#   team: data frame of player rows for a single team. It must have a `Date`
#         column (used as the match key) and a `WinLoss` column coded
#         "W" / "L" / "D". The first nine columns are treated as match-level
#         information; numeric columns are summed per match.
#
# Returns:
#   A data frame with one row per non-drawn match: the nine match-level
#   columns merged with the summed statistics. `WinLoss` is a factor recoded
#   to "0" (loss) and "1" (win).
cleanAFLdata <- function(team) {
  library(plyr)
  # Sum every numeric column within each match.
  team_stats <- ddply(team, .(Date), numcolwise(sum))
  # Keep Date plus the summed columns at positions 5 to 23; positions 2 to 4
  # are dropped (presumably summed match-level columns -- confirm against
  # the column layout of the input).
  team_stats <- team_stats[c(1, 5:23)]
  # First row of each match supplies the match-level columns.
  team_keep <- team[match(unique(team$Date), team$Date), ]
  team_keep <- team_keep[1:9]
  team_clean <- merge(team_keep, team_stats)
  # Drop drawn matches.
  team_clean <- team_clean[team_clean$WinLoss != "D", ]
  # Rebuild the factor from its text values. droplevels() has no method for
  # character vectors, so this also works when WinLoss was read in as text
  # (the read.csv() default since R 4.0.0).
  win_loss <- factor(as.character(team_clean$WinLoss))
  levels(win_loss)[levels(win_loss) == "L"] <- "0"
  levels(win_loss)[levels(win_loss) == "W"] <- "1"
  team_clean$WinLoss <- factor(win_loss)
  team_clean
}
# Run cleanAFLdata() over every team's player rows and bind each result to
# "<team>_clean" in the current environment.
team_frames <- list(
  ad = ad, bl = bl, clt = clt, clwd = clwd, esdn = esdn, fd = fd,
  glng = glng, gc = gc, gws = gws, ht = ht, mlb = mlb, nmlb = nmlb,
  pad = pad, rmd = rmd, sk = sk, syd = syd, wc = wc, wb = wb
)
cleaned <- lapply(team_frames, cleanAFLdata)
names(cleaned) <- paste0(names(cleaned), "_clean")
# invisible() stops the environment returned by list2env() from printing.
invisible(list2env(cleaned, envir = environment()))
After going through all this effort to split the data to be in terms of separate teams it actually meant that for each team we only had a very small set of data points. This made any prediction models for each individual team impossible as the training set (2012 - 2015) only had roughly 100 rows and our test set (2016) only had about 25 rows; which R didn’t think was enough data for the predictive models we wanted to do. So we ended up just combining everything back into one nice big data set (after being cleaned).
# Stack the per-team match tables back into a single data frame.
clean_frames <- list(
  ad_clean, bl_clean, clt_clean, clwd_clean, esdn_clean, fd_clean,
  glng_clean, gc_clean, gws_clean, ht_clean, mlb_clean, nmlb_clean,
  pad_clean, rmd_clean, sk_clean, syd_clean, wc_clean, wb_clean
)
AFL_clean <- do.call(rbind, clean_frames)
#length(unique(AFL_clean$Team))
Once our data was once again in one big, combined data frame we decided (and R accepted our decision) it was finally time to successfully split our data into a separate training and test set. It made sense for our training set to be all the games in 2012 - 2015 (inclusive) and then our test set to be just the matches played in the latest (in our data set) 2016 season. This made sense as our goal is to predict the current (2017) matches so having our test set being the closest to that as possible meant our predictive models would (in theory) ‘tune’ our model to be as accurate for the current season as we could hope for.
# The 2016 season is held out as the test set; every other season is training.
is_2016 <- AFL_clean$Season == 2016
AFL.test <- AFL_clean[is_2016, ]
#head(AFL.test)
AFL.train <- AFL_clean[!is_2016, ]
#head(AFL.train)
Next we decided to do a little exploratory analysis of our training set, this was to make sure everything makes logical sense (check we haven’t made an error during our cleaning process), and just to explore the data and see if we can find anything interesting.
# One plot per statistic, split by match outcome (WinLoss is a factor, so
# plot() draws a box plot for each level).
for (stat in c("Disposals", "Inside50s", "Clearances")) {
  plot(AFL.train$WinLoss, AFL.train[[stat]])
}

From these few plots we can see that our data looks to be intact, as we actually got given functional box plots as outputs. And more importantly it seems that at least some of these variables have some sort of a correlation with teams who’ve won (1) and teams who’ve lost (0). This meant that some of our variables might be strong candidates in being able to assist us in predicting the outcomes of future games.
As a second measure to this process we created a correlation coefficient histogram matrix with all of our variables to see if anything had a particularly strong correlation with our ‘WinLoss’ variable.
# Load the helper script; panel.pts, panel.cor and panel.hist used below are
# presumably defined there -- confirm.
source("PairsWithHist.R")
# Scatterplot matrix over every column of AFL_clean, with custom panels for
# the lower triangle, upper triangle and diagonal.
pairs(AFL_clean, lower.panel = panel.pts, upper.panel = panel.cor, diag.panel = panel.hist)
Our next step was to actually begin preparing to make some predictive models; which wasn’t just as simple as fitting a model with all the variables and then fitting another model with none, and then running forward selection between the two (although that was one step within the process). After some trial and error and thought into what we were actually doing some things became clear; we had too many variables and some variables just didn’t make sense being in our model(s).
To reduce our total number of variables we first removed all the categorical explanatory variables. Our reason for doing this is that each level of a factor variable within a predictive model becomes its own variable, not just the one factor variable itself. For example, by including the factor variable ‘Venue’ we are introducing 19 new explanatory variables into our model, not simply just venue.
The next problem was that the ‘Margin’ variable was a perfect predictor of whether a team would win or lose. If the margin for a game was greater than zero the team had won, if the margin was negative then the team had lost. By including this in our model it would have had such a strong correlation that all the other variables, which are arguably more useful for our overall goal, would be lost as noise; so we had to remove it.
Variables like ‘Score’ and ‘Goals’ were in a similar sort of boat because teams who win are always going to have a higher score, because that’s how you win the game. This is such a basic premise that it really made no sense to include in our model as it was just adding noise. And the last set of variables we didn’t include in our model were ‘Handballs’ and ‘Kicks’ as they are just subsets of the ‘Disposal’ variable, making it pointless to include all three within our model.
After all of these ‘types’ of variables were excluded from our model we were left with the majority of the integer match statistics like ‘Disposals,’ ‘Inside50s,’ ‘Clangers,’ etc.
# Logistic regression of match outcome on the numeric match statistics only
# (the "full" model; its formula is later used as the scope for forward
# selection). Fitted on the training seasons.
AFL.glm.num <- glm(WinLoss ~ Disposals + Marks + Behinds + Inside50s + Clearances + Clangers + ContendedPossessions + ContestedMarks + MarksInside50,
data = AFL.train, family = binomial(logit))
Then we set up a model with no explanatory variables.
# Intercept-only logistic regression: the starting point for forward selection.
AFL.glm.0 <- glm(WinLoss ~ 1, data = AFL.train, family = binomial(logit))
And finally we compared these two models to find the optimal set of variables we should be including in our final model. Which by looking at the summary below you might have noticed left us with seven explanatory variables in our final model; removing ‘ContestedMarks’ and ‘Behinds’ as forward selection didn’t find them relevant.
All the variables left in the model also have very small p-values which is always a good sign that they could be highly correlated to the response variable (winning a match).
# Forward selection: start from the intercept-only model and add terms from
# the numeric model's formula; trace = 0 suppresses the step-by-step log.
AFL.glm.fwd <- step(AFL.glm.0, scope = formula(AFL.glm.num), direction = "forward", trace = 0)
summary(AFL.glm.fwd)
Call:
glm(formula = WinLoss ~ Inside50s + Marks + ContendedPossessions +
MarksInside50 + Clangers + Clearances + Disposals, family = binomial(logit),
data = AFL.train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.69869 -0.70594 -0.01347 0.69685 2.59537
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -17.993850 1.172142 -15.351 < 2e-16 ***
Inside50s 0.067637 0.010162 6.656 2.82e-11 ***
Marks 0.040351 0.005638 7.157 8.24e-13 ***
ContendedPossessions 0.047621 0.006556 7.264 3.77e-13 ***
MarksInside50 0.158085 0.019926 7.933 2.13e-15 ***
Clangers -0.031204 0.009413 -3.315 0.000917 ***
Clearances 0.034660 0.012002 2.888 0.003878 **
Disposals 0.007527 0.002734 2.753 0.005898 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2279.1 on 1643 degrees of freedom
Residual deviance: 1463.4 on 1636 degrees of freedom
AIC: 1479.4
Number of Fisher Scoring iterations: 5
Analysis
Logistic Regression
The first predictive model that we decided to implement was a Logistic Regression model, with ‘WinLoss’ being our response variable, obviously with a ‘1’ (winning game) being the preferred outcome.
# Predicted probability of a win for each 2016 (test) match; type = "response"
# returns probabilities rather than values on the logit scale.
AFL.all.pred <- predict(AFL.glm.fwd, newdata = AFL.test, type = "response")
#head(AFL.all.pred)
After the model had made its predictions on our test data, we had to compare these predictions to the actual outcomes for 2016. The most efficient way of going about this was to create a confusion matrix of our predicted results against the true results; this gives us an idea of how accurate our model actually is.
# Label a match as a win (1) only when the predicted probability exceeds 0.75;
# everything else, including missing predictions, is labelled 0.
AFL.all.pred.class <- as.numeric(!is.na(AFL.all.pred) & AFL.all.pred > 0.75)
# Confusion matrix: rows are predicted classes, columns are actual outcomes.
table(AFL.all.pred.class, AFL.test$WinLoss)
AFL.all.pred.class 0 1
0 188 75
1 19 132
# Accuracy = correct predictions (diagonal) / all predictions. The counts are
# typed in by hand from the confusion matrix printed above, so they must be
# updated if the model or data change.
(188+132)/(188+75+19+132)
[1] 0.7729469
From this confusion matrix we can see that about 77% of our predictions were correct when we used a cut-off value of 0.75. This means that any game predicted by our model that has above a 75% chance of winning will only then be classified as a ‘sure win.’ This cut-off seems like a fair assumption because, assuming the model actually worked in the way we wanted it to, every four games you placed a bet on you’d only lose one of those four. And when you factor in even some average odds you’d almost always come out profitable by a long shot if that was the case.
This model has a pretty reasonable success rate even when this high cut-off is put in place however, it suffers from the same ‘problem’ that all our other predictive models do (discussed after all the predictive models have been ‘showcased’).
K-Nearest Neighbours
After our Logistic Regression model came out as such a success we decided that the next best step would be to try a K-Nearest Neighbors model to see if it’s more or less accurate when compared to our previous predictive model.
library(class)
# Predictors: the seven variables kept by forward selection.
knn_vars <- c("Disposals", "Marks", "Inside50s", "Clearances", "Clangers", "ContendedPossessions", "MarksInside50")
# 3-nearest-neighbour classification of the test matches.
# NOTE(review): knn() breaks voting ties at random, so results may vary
# between runs unless a seed is set -- confirm whether that matters here.
AFL.knn.pred <- knn(
  train = AFL.train[, knn_vars],
  test = AFL.test[, knn_vars],
  cl = AFL.train$WinLoss,
  k = 3
)
# Confusion matrix: rows are predicted classes, columns are actual outcomes.
table(AFL.knn.pred, AFL.test$WinLoss)
AFL.knn.pred 0 1
0 139 39
1 68 168
# Share of test matches classified correctly, computed from the predictions
# themselves. The hand-typed counts previously used here (167 and 40) did not
# match the confusion matrix printed above (168 and 39).
mean(as.character(AFL.knn.pred) == as.character(AFL.test$WinLoss))
[1] 0.7391304
And, perhaps unsurprisingly, the K-Nearest Neighbors model resulted in a prediction accuracy which is only roughly 4% behind the Logistic Regression model. This shows again that our models are consistently predicting at a much better rate than a 50/50 guess. However, this model uses the same set of variables and thus suffers from the same problem as the previous one.
Decision Tree(s)
We’d done a Logistic Regression and a K-Nearest Neighbors model, so it really only made sense to finish the trio by attempting to create a Decision Tree or Random Forest model.
library(rpart)
# Classification tree on the numeric match statistics. It is fitted on the
# training seasons only: fitting on AFL_clean would let the tree see the 2016
# matches it is scored on below, inflating the reported accuracy. Any
# confusion matrix produced from the earlier fit needs to be regenerated.
AFL.num.tree <- rpart(WinLoss ~ Disposals + Marks + Behinds + Inside50s + Clearances + Clangers + ContendedPossessions + ContestedMarks + MarksInside50,
                      data = AFL.train)
# predict() returns one probability column per class; the second column
# belongs to the second factor level ("1", a win).
AFL.num.tree.pred <- predict(AFL.num.tree, AFL.test)
AFL.num.tree.pred <- AFL.num.tree.pred[, 2]
#head(AFL.num.tree.pred)
# Same 0.75 cut-off as the logistic regression model.
AFL.num.tree.class <- rep(0, length(AFL.num.tree.pred))
AFL.num.tree.class[AFL.num.tree.pred > 0.75] <- 1
table(AFL.num.tree.class, AFL.test$WinLoss)
AFL.num.tree.class 0 1
0 172 70
1 35 137
# Accuracy = correct predictions (diagonal) / all predictions. The counts are
# typed in by hand from the confusion matrix printed above, so they must be
# updated if the model or data change.
(172 + 137)/(172+70+35+137)
[1] 0.7463768
To our surprise the Decision Tree model actually predicted with an almost identical success rate to the K-Nearest Neighbors model. I guess this just shows that our variables must have quite even splits with respect to wins and losses. Again, this model suffers from the same problem as the previous two.
However, this model wasn’t as useless as the previous two models because we can actually visualize the different splits that the Decision Tree uses which gives us valuable insight into how different combinations of variables interact to either improve your chances of winning or hurt them.
# as.party() comes from partykit. rpart.plot and randomForest are loaded here
# but not called in the code shown in this report.
library(rpart.plot)
library(randomForest)
library(partykit)
# Convert the rpart fit to a party object and draw the tree diagram.
plot(as.party(AFL.num.tree))
The Decision Tree diagram is one of the more valuable things to come out of the analysis with respect to our goal to predict the outcome of future games. By just following the rightmost path of the diagram we can see that more Inside50s paired with more MarksInside50 is a ‘recipe’ for a very high chance of winning; which seems quite obvious. By following the leftmost path we can see the complete opposite: fewer Inside50s paired with fewer MarksInside50 results more often than not in a lower chance of winning; which again seems quite obvious.
This diagram only really starts to get interesting when you look at the combinations of the many different variables. For example if a team isn’t getting a lot of Inside50s but is getting a lot of MarksInside50, and then also the team has a decent amount of Marks overall then they suddenly have a much better chance at winning.
This diagram might be especially useful for something like half-time betting. If two teams are almost neck and neck at half-time but one team is following one of these ‘paths’ that lead to a higher proportion of wins, then it might be in your best interest to bet on that team over their opposition. In no way is it a perfect set of paths that lead to victory for a team; however, using it is certainly a little more useful than blindly picking one team or the other.
The Problem
Perhaps a little foresight into what our models are actually doing would’ve been useful and saved us a lot of wasted time. The problem with all our ‘predictive’ models is that they’re not actually predicting anything useful. All they are doing is classifying a game as either a win or a loss given the post-game statistics for a team. These are statistics which can only be collected once a game has finished, not before a game has been played, which was the whole ‘problem’ we set out to solve.
This makes our models sort of useless compared to the way we’d imagined them, but not completely useless. The fact that they could predict three out of every four games correctly with just the variables we gave them to work with reinforces the fact that some sort of correlation exists between those variables and a team winning a game. To actually solve our problem it might have been a lot more useful to be looking at quarter-time or half-time team statistics and then seeing how those relate to a win or a loss.
Plotting
After our ‘predictive’ models really didn’t help us with what we wanted to achieve we had to take a step back and think about a more specific way of analyzing the data which actually somewhat helps in predicting the outcomes of future matches. We thought of two main things that we were curious to look into:
- How often are individual teams winning & losing within our data set, and how does this compare to their performance in the current 2017 season?
The first thing we needed to do was separate the total wins and loses with respect to each team throughout the whole data set. Then we put them into descending order because that’s how we intended them to be shown within our plots we planned on making.
# Cross-tabulate team against outcome; the columns are the WinLoss levels
# ("0" = loss, "1" = win).
table_all <- table(AFL_clean$Team, AFL_clean$WinLoss)
all_freq <- as.data.frame.matrix(table_all)
# Teams with the most wins first.
wins_desc <- order(all_freq[["1"]], decreasing = TRUE)
all_freq <- all_freq[wins_desc, ]
#all_freq
This wasn’t the end of the story though. Simply graphing each team’s total frequency of wins and losses and then ordering them by number of wins would be completely unfair. Some teams have just played more games than other teams which means they’d by default probably have more wins because they’ve simply played more games. Thus, to show them fairly next to each other we have to get them in terms of their win/loss percentage or proportion. That is, out of all the games they’ve played what fraction of them did they win, and what fraction did they lose?
# Convert win/loss counts into proportions of each team's games.
games_played <- all_freq[["0"]] + all_freq[["1"]]
win_perc <- all_freq[["1"]] / games_played
loss_perc <- 1 - win_perc
# Keep only the two proportion columns; row names still identify the teams.
all_perc <- cbind(all_freq, loss_perc, win_perc)[c("loss_perc", "win_perc")]
names(all_perc) <- c("Loss Percentage", "Win Percentage")
#all_perc
# Highest win proportion first.
all_perc_decr <- all_perc[order(all_perc[["Win Percentage"]], decreasing = TRUE), ]
#all_perc_decr
# Mosaic plot of loss (red) vs win (green) proportion per team.
mosaicplot(
  all_perc_decr,
  col = c("#D7191C", "#A6D96A"),
  main = "Total Win vs Loss Percentage for each team (2012-2016) [Descending Order]",
  cex.axis = 1.2
)

Here we’ve just split the above plot in half so it’s easier to see, and what we’ve done is compare the rank of the teams in our descending win percentage plot from the 2012 - 2016 data against the current 2017 ladder positions (as of 31-05-2017). A green dot next to a team represents them being in the same half of both the 2017 ladder and our 2012 - 2016 descending win percentage rankings. As you might have noticed, by doing this we only managed to place just over half of the teams in their correct halves, 56% to be exact. However, quite a few of the teams are only off by one or two positions, and the season’s not finished yet, so there’s always room for the teams to do the natural thing and converge on their relative historical rankings.
Not to mention this season of AFL has just been particularly unpredictable; teams who’ve generally been on top have fallen and teams who’ve generally been at the bottom have started to rise. Just look at GWS; from 2012 - 2016 they have the fourth lowest win percentage yet they’re currently second on the ladder. This probably comes down to them being a young team and each year their players are getting exponentially more experienced and playing better as a result, but it’s interesting nonetheless.
- Are certain teams winning a lot more often at certain venues, and does this have anything to do with how much experience they have playing at these venues?
To answer these questions we first had to determine which venues all of the teams have played at so we have a decent sample size of matches to work with, and to make sure each team is represented within each venue’s analysis. To do this we plotted our score variable for each team filtered by each different venue we have in our data set.
library(lattice)
# Lattice bar chart of Score by Team, drawn as one panel per Venue.
barchart(AFL_clean$Score ~ AFL_clean$Team | AFL_clean$Venue)

Using this plot and comparing the total amount of games played at each venue we narrowed the venues we wanted to look at down to Subiaco, Adelaide Oval, M.C.G, S.C.G and The Gabba. These locations were also picked as they’re the most popular venues for Perth, Adelaide, Melbourne, Sydney and Brisbane respectively; meaning if there’s any correlation between the teams playing on their home ground we should be able to see it across all the main venues.
To do this we had to extract the wins and losses that each team has had when only looking at a specific venue.
# Per-team results at a single venue.
#
# Args:
#   venue: value matched against AFL_clean$Venue (AFL_clean is read from the
#          calling environment).
#
# Returns:
#   A data frame with one row per team (row names are the team names) and
#   columns `0` (losses), `1` (wins), `team_proportion` (the team's share of
#   all team-games at the venue), `Loss Percentage` and `Win Percentage`,
#   sorted by win percentage in descending order.
venueFilter <- function(venue) {
  at_venue <- AFL_clean[AFL_clean$Venue == venue, ]
  counts <- as.data.frame.matrix(table(at_venue$Team, at_venue$WinLoss))
  # Pre-sort by number of wins; this decides the order of teams that tie on
  # win percentage in the final sort.
  counts <- counts[order(counts[["1"]], decreasing = TRUE), ]
  games <- counts[["0"]] + counts[["1"]]
  win_share <- counts[["1"]] / games
  result <- cbind(counts, team_proportion = games / sum(games))
  result[["Loss Percentage"]] <- 1 - win_share
  result[["Win Percentage"]] <- win_share
  result[order(result[["Win Percentage"]], decreasing = TRUE), ]
}
# Per-team win/loss tables for each venue of interest. Each string is compared
# against AFL_clean$Venue inside venueFilter(), so it must match exactly.
subiaco <- venueFilter("Subiaco")
syd.showground <- venueFilter("Sydney Showground")
gabba <- venueFilter("Gabba")
mcg <- venueFilter("M.C.G.")
scg <- venueFilter("S.C.G.")
adl.oval <- venueFilter("Adelaide Oval")
carrara <- venueFilter("Carrara")
docklands <- venueFilter("Docklands")
ftbl.park <- venueFilter("Football Park")
I’ve just included the code for only looking at The Gabba but every single plot was made using the same code and just replacing the venue with whichever one you were interested in.
library(ggplot2)
# Third column of the venue table: each team's share of games at the venue.
gabba.tprop <- gabba[3]
# One fill colour per team (18 in total).
team_palette <- c(
  "#999999", "#E69F00", "#56B4E9", "#8fc0a0", "#cc9b5b", "#71529b",
  "#7bf68d", "#96270f", "#1e46be", "#b60edc", "#678dbd", "#4eb20b",
  "#5f37e5", "#02339b", "#a7a4df", "#2b681e", "#2c3f7c", "#d9ed8f"
)
# Single stacked bar with one segment per team.
bp <- ggplot(gabba.tprop, aes(x = "", y = team_proportion, fill = row.names(gabba.tprop))) +
  geom_bar(width = 1, stat = "identity") +
  scale_fill_manual(values = team_palette)
bp

# The same bar in polar coordinates, which turns it into a pie chart.
pie <- bp + coord_polar("y", start = 0)
pie

# Columns 4 and 5 of the venue table: loss and win percentage per team.
gabba.wperc <- gabba[4:5]
#gabba.wperc
# Loss (red) vs win (green) percentage per team at The Gabba. The title
# previously said "Subiaco Oval", which did not match the data being plotted.
mosaicplot(gabba.wperc, col = c(rgb(215/255, 25/255, 28/255), rgb(166/255, 217/255, 106/255)), main = "Total Win vs Loss Percentage for each team at The Gabba (2012-2016) [Descending Order]", cex.axis = 1.2)

So you don’t have to look at the same code ten times and squint trying to look at the downsized R-output I’ve just included the slides I made from the power point, but like I said before it was all made using the exact same code above.
Subiaco
Subiaco is the home ground of Fremantle and West Coast and as seen on the first plot they’re also in the top two places for win percentage when just looking at games played at Subiaco. The second plot shows the fraction of games that each individual team has played at Subiaco and by no surprise Fremantle and West Coast together make up about 60% of the games. This leaves the other 16 teams to share the remaining 40% of games between themselves, which equates to about six games each.
After looking at these plots it makes a lot more sense that Fremantle and West Coast are topping the win percentage at Subiaco because they’re so disproportionately represented at the venue. Not to mention that all the other teams have to fly quite a distance to get to Subiaco, this probably contributes heavily to the fact that the majority of the teams have less than a 50% win rate.
Relating back to our goal of predicting future outcomes of games, this gives us some very useful information. When Brisbane, Collingwood, Melbourne or St Kilda are playing at Subiaco it’s probably a good idea to bet against them, especially if they’re up against Fremantle or West Coast.

Adelaide Oval
Looking at Adelaide Oval we see a similar trend with the home teams, Adelaide and Port Adelaide, together making up about 60% of the total games played on their home turf. Similarily, this leaves the other 16 teams to share the remaining 40% of games, but this time that only equates to about four games each (as less games have been played at Adelaide Oval compared to Subiaco).
When we look at the win percentages there’s definitely some improvement, as Port Adelaide have moved from being ranked 10th overall to being ranked 6th; however, Adelaide has stayed stagnant in the same position. Interestingly, we also have a couple of different teams dominating the top positions and a couple more teams having never won a game at the venue.
Using this information we could say that if Port Adelaide is playing a team that’s on par with them, but they’re playing at Adelaide Oval, then they might just have that extra edge from the home advantage and come out on top. We see Brisbane, Collingwood and St Kilda being repeated on the bottom of the list with no wins what so ever, again, making your chances of winning when betting against them (if they’re playing at Subiaco or Adelaide Oval) that much better.

M.C.G.
The M.C.G is a much more popular venue with a total of 500 games played at it, which is more games than Subiaco and Adelaide Oval combined. It’s also in Melbourne, which is within a couple of hours’ drive of over half the AFL teams. These factors contribute to the M.C.G having a much more even spread of teams’ total games played at the venue; as seen on the second plot, each close-by team holds about 5% - 10% of the games instead of 30%.
However, even with the spread being a lot more even and no team(s) being extremely disproportionately represented, we still see this trend of the surrounding, close-by teams playing better. Every single one of the close-by teams has either held their position or improved dramatically, with the Western Bulldogs climbing nearly half of the ranks. However, we still see Brisbane in the bottom few ranks, adding to the pile of evidence that you should probably always bet against them if they’re playing an away game.

S.C.G.
The S.C.G has the smallest number of games played at it compared to all the other venues we’re looking at, only 90 games in total, which is less than 1/5th of the total games played at the M.C.G. This sort of explains the extreme disproportionality that we see at the venue, with Sydney having played about half of all the games played there. Interestingly though, GWS plays there just about as often as an away team despite being situated quite close to the venue.
The win percentages are quite skewed here due to the small amount of games that have been played there by away teams, each team has only played about two or three games. This makes it hard to get any useful information from the win percentage plot however, it has to be noted that Brisbane has no wins again despite actually being roughly one of the closest away teams to the venue.

The Gabba
Looking at The Gabba we can fully confirm that things really aren’t looking too great for Brisbane. Despite it being their home ground and then having played about 50% of the total games at the venue they are still very close to the bottom of the ranks. Again it is strange how Gold Coast is within an hours drive of The Gabba yet they still play there as often as an away team.
The Gabba doesn’t have many more games played at it than the S.C.G however, it’s interesting that the skew is sort of in the completely opposite direction, with nearly half the teams having a (or close to a) 100% win rate. This makes sense when you consider that most of the teams would be facing up against the home team, Brisbane, who we’ve already established probably aren’t the best team to be putting your money on.

Appendix
# Read the raw player-level statistics. stringsAsFactors = TRUE keeps text
# columns as factors, which was the read.csv() default before R 4.0.0; later
# steps (droplevels() on WinLoss, pairs() over the whole data frame) rely on
# factor columns.
AFLstats <- read.csv("stats.csv", stringsAsFactors = TRUE)
AFL <- AFLstats
# Keep columns 1 to 31 except positions 2, 24 and 26.
AFL <- AFL[c(1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25,27,28,29,30,31)]
colnames(AFL)
# Names of columns 10 to 28 of the trimmed data (the per-player statistics).
sum_names <- colnames(AFL)[10:28]
sum_names
# Team names in order of first appearance in the data. The short variable
# names below rely on that order -- TODO confirm it matches the intended teams.
all_teams <- unique(as.character(AFL$Team))
# Player rows belonging to the i-th entry of all_teams.
team_rows <- function(i) {
  AFL[AFL$Team == all_teams[i], ]
}
ad <- team_rows(1)
bl <- team_rows(2)
clt <- team_rows(3)
clwd <- team_rows(4)
esdn <- team_rows(5)
fd <- team_rows(6)
glng <- team_rows(7)
gc <- team_rows(8)
gws <- team_rows(9)
ht <- team_rows(10)
mlb <- team_rows(11)
nmlb <- team_rows(12)
pad <- team_rows(13)
rmd <- team_rows(14)
sk <- team_rows(15)
syd <- team_rows(16)
wc <- team_rows(17)
wb <- team_rows(18)
# Collapse one team's player-level rows into one row per match.
#
# Args:
#   team: data frame of player rows for a single team. It must have a `Date`
#         column (used as the match key) and a `WinLoss` column coded
#         "W" / "L" / "D". The first nine columns are treated as match-level
#         information; numeric columns are summed per match.
#
# Returns:
#   A data frame with one row per non-drawn match: the nine match-level
#   columns merged with the summed statistics. `WinLoss` is a factor recoded
#   to "0" (loss) and "1" (win).
cleanAFLdata <- function(team) {
  library(plyr)
  # Sum every numeric column within each match.
  team_stats <- ddply(team, .(Date), numcolwise(sum))
  # Keep Date plus the summed columns at positions 5 to 23; positions 2 to 4
  # are dropped (presumably summed match-level columns -- confirm against
  # the column layout of the input).
  team_stats <- team_stats[c(1, 5:23)]
  # First row of each match supplies the match-level columns.
  team_keep <- team[match(unique(team$Date), team$Date), ]
  team_keep <- team_keep[1:9]
  team_clean <- merge(team_keep, team_stats)
  # Drop drawn matches.
  team_clean <- team_clean[team_clean$WinLoss != "D", ]
  # Rebuild the factor from its text values. droplevels() has no method for
  # character vectors, so this also works when WinLoss was read in as text
  # (the read.csv() default since R 4.0.0).
  win_loss <- factor(as.character(team_clean$WinLoss))
  levels(win_loss)[levels(win_loss) == "L"] <- "0"
  levels(win_loss)[levels(win_loss) == "W"] <- "1"
  team_clean$WinLoss <- factor(win_loss)
  team_clean
}
# Spot-check the cleaning function on one team first.
syd_clean <- cleanAFLdata(syd)
head(syd_clean)
# Clean every team and bind each result to "<team>_clean".
team_frames <- list(
  ad = ad, bl = bl, clt = clt, clwd = clwd, esdn = esdn, fd = fd,
  glng = glng, gc = gc, gws = gws, ht = ht, mlb = mlb, nmlb = nmlb,
  pad = pad, rmd = rmd, sk = sk, syd = syd, wc = wc, wb = wb
)
cleaned <- lapply(team_frames, cleanAFLdata)
names(cleaned) <- paste0(names(cleaned), "_clean")
# invisible() stops the environment returned by list2env() from printing.
invisible(list2env(cleaned, envir = environment()))
# Stack all teams into one data frame; unname() keeps rbind() from prefixing
# row names with the list names.
AFL_clean <- do.call(rbind, unname(cleaned))
length(unique(AFL_clean$Team))
# The 2016 season is held out as the test set; every other season is training.
is_2016 <- AFL_clean$Season == 2016
AFL.test <- AFL_clean[is_2016, ]
head(AFL.test)
AFL.train <- AFL_clean[!is_2016, ]
head(AFL.train)
# Disposals split by match outcome (WinLoss is a factor).
plot(AFL.train$WinLoss, AFL.train$Disposals)
library(ggplot2)
# The same comparison drawn with ggplot2's quick-plot helper.
qplot(WinLoss, Disposals, data = AFL.train)
# Load the helper script; panel.pts, panel.cor and panel.hist used below are
# presumably defined there -- confirm.
source("PairsWithHist.R")
# Scatterplot matrix over every column of AFL_clean with custom panels.
pairs(AFL_clean, lower.panel = panel.pts, upper.panel = panel.cor, diag.panel = panel.hist)
# Logistic regression on the numeric match statistics only; its formula is the
# scope for forward selection below.
AFL.glm.num <- glm(WinLoss ~ Disposals + Marks + Behinds + Inside50s + Clearances + Clangers + ContendedPossessions + ContestedMarks + MarksInside50,
data = AFL.train, family = binomial(logit))
# 'all' variables (Margin excluded because too perfect)
# This fit is not referenced again in the code visible here.
AFL.glm.all <- glm(WinLoss ~ Score + Opposition + Venue + Disposals + Marks + Goals + Behinds + Hitouts + Tackles + Rebound50s + Inside50s + Clearances + Clangers + FreesFor + ContendedPossessions + ContestedMarks + MarksInside50 + OnePercenters + Bounces+GoalAssists,
data = AFL.train, family = binomial)
#summary(AFL.glm.all)
# Intercept-only model: the starting point for forward selection.
AFL.glm.0 <- glm(WinLoss ~ 1, data = AFL.train, family = binomial(logit))
# Forward selection from the null model towards the numeric model's terms;
# trace = 0 suppresses the step-by-step log.
AFL.glm.fwd <- step(AFL.glm.0, scope = formula(AFL.glm.num), direction = "forward", trace = 0)
summary(AFL.glm.fwd)
AFL.all.pred <- predict(AFL.glm.fwd, newdata = AFL.test, type = "response")
head(AFL.all.pred)
AFL.all.pred.class <- rep(0, length(AFL.all.pred))
AFL.all.pred.class[AFL.all.pred > 0.75] <- 1
table(AFL.all.pred.class, AFL.test$WinLoss)
(188+132)/(188+75+19+132)
library(rpart)

# Classification tree on the wider predictor set, fitted to every season.
# In this section it is only plotted, never scored on held-out data.
AFL.tree <- rpart(
  WinLoss ~ Opposition + Venue + Disposals + Marks + Goals + Behinds +
    Hitouts + Tackles + Rebound50s + Inside50s + Clearances + Clangers +
    FreesFor + ContendedPossessions + ContestedMarks + MarksInside50 +
    OnePercenters + Bounces + GoalAssists,
  data = AFL_clean
)
# summary(AFL.tree)

# Tree on the match-statistic predictors. It is fitted on AFL.train, not
# AFL_clean: AFL_clean contains the 2016 rows, so fitting on it and then
# scoring on AFL.test would evaluate the tree on data it was trained on.
AFL.num.tree <- rpart(
  WinLoss ~ Disposals + Marks + Behinds + Inside50s + Clearances + Clangers +
    ContendedPossessions + ContestedMarks + MarksInside50,
  data = AFL.train
)
AFL.num.tree.pred <- predict(AFL.num.tree, AFL.test)
# Keep the second probability column, i.e. the second WinLoss level ("1" = win).
AFL.num.tree.pred <- AFL.num.tree.pred[, 2]
head(AFL.num.tree.pred)
# Classify as a win (1) only when the predicted probability exceeds 0.75.
AFL.num.tree.class <- rep(0, length(AFL.num.tree.pred))
AFL.num.tree.class[AFL.num.tree.pred > 0.75] <- 1
table(AFL.num.tree.class, AFL.test$WinLoss)
# Accuracy computed from the predictions rather than hand-copied table counts.
mean(AFL.num.tree.class == as.character(AFL.test$WinLoss), na.rm = TRUE)
# Packages used for fitting and drawing the trees in this section.
library(rpart.plot)
library(randomForest)
library(partykit)
library(rpart)

# Tree fitted on the wider predictor set.
plot(as.party(AFL.tree))

# Win/loss explained by the two teams involved.
AFL.teams <- rpart(WinLoss ~ Team + Opposition, data = AFL_clean)
plot(as.party(AFL.teams))

# Win/loss explained by team and venue.
AFL.venue <- rpart(WinLoss ~ Team + Venue, data = AFL_clean)
plot(as.party(AFL.venue))

# Tree fitted on the match-statistic predictors.
plot(as.party(AFL.num.tree))
library(class)

# Predictor columns shared by the training and test matrices; defined once so
# the two cannot drift apart.
knn.predictors <- c(
  "Disposals", "Marks", "Behinds", "Inside50s", "Clearances", "Clangers",
  "ContendedPossessions", "ContestedMarks", "MarksInside50"
)
# 3-nearest-neighbour classification of the held-out test rows.
AFL.knn.pred <- knn(
  train = AFL.train[, knn.predictors],
  test = AFL.test[, knn.predictors],
  cl = AFL.train$WinLoss,
  k = 3
)
table(AFL.knn.pred, AFL.test$WinLoss)
# Accuracy computed from the predictions rather than hand-copied table counts.
mean(as.character(AFL.knn.pred) == as.character(AFL.test$WinLoss), na.rm = TRUE)
# Venues present in the data, the number of rows recorded at Traeger Park, and
# the row count for every venue.
unique(AFL_clean$Venue)
nrow(AFL_clean[AFL_clean$Venue == "Traeger Park", ])
table(AFL_clean$Venue)

# Wins and losses for each team.
table_all <- table(AFL_clean$Team, AFL_clean$WinLoss)

# Cross-tabulate one team's opponents (rows) against its results (columns).
opposition_table <- function(team_df) {
  table(team_df$Opposition, team_df$WinLoss)
}

# Tables of each team and their wins/losses against every opposition.
table_ad <- opposition_table(ad_clean)
table_wc <- opposition_table(wc_clean)
table_bl <- opposition_table(bl_clean)
table_clt <- opposition_table(clt_clean)
table_clwd <- opposition_table(clwd_clean)
table_esdn <- opposition_table(esdn_clean)
table_fd <- opposition_table(fd_clean)
table_gc <- opposition_table(gc_clean)
table_glng <- opposition_table(glng_clean)
table_gws <- opposition_table(gws_clean)
# Mosaic plots of the win/loss tables.
library(vcd)

# Loss cells are drawn red, win cells dark green.
win_loss_cols <- c("red", "darkgreen")

# Draw one win/loss mosaic with the shared colours and axis labels.
draw_win_loss_mosaic <- function(tab, title) {
  mosaicplot(tab, col = win_loss_cols, main = title, xlab = "Team", ylab = "Win or loss")
}

draw_win_loss_mosaic(table_all, "Wins&Losses against each team (2012-2016)")
draw_win_loss_mosaic(table_wc, "West Coast and their wins and losses against each team (2012-2016)")
draw_win_loss_mosaic(table_ad, "Adelaide and their wins and losses against each team (2012-2016)")
draw_win_loss_mosaic(table_bl, "Brisbane Lions and their wins and losses against each team (2012-2016)")
draw_win_loss_mosaic(table_clt, "Carlton and their wins and losses against each team (2012-2016)")
draw_win_loss_mosaic(table_clwd, "Collingwood and their wins and losses against each team (2012-2016)")
# Per-team loss ("0") and win ("1") counts as a data frame, most wins first.
all_freq <- as.data.frame.matrix(table_all)
wins_desc <- order(all_freq[["1"]], decreasing = TRUE)
all_freq <- all_freq[wins_desc, ]
all_freq
mosaicplot(
  all_freq,
  col = c("red", "darkgreen"),
  main = "Win vs Loss Frequency for each team (2012-2016)",
  cex.axis = 1.2
)

# Convert the counts into each team's share of losses and wins.
win_perc <- all_freq[["1"]] / (all_freq[["0"]] + all_freq[["1"]])
loss_perc <- 1 - win_perc
all_perc <- cbind(all_freq, loss_perc, win_perc)[c("loss_perc", "win_perc")]
colnames(all_perc) <- c("Loss Percentage", "Win Percentage")
all_perc
mosaicplot(
  all_perc,
  col = c("red", "darkgreen"),
  main = "Win vs Loss Percentage for each team (2012-2016)",
  cex.axis = 1.2
)

# Same table re-sorted so the highest win percentage comes first.
by_win_perc <- order(all_perc[["Win Percentage"]], decreasing = TRUE)
all_perc_decr <- all_perc[by_win_perc, ]
all_perc_decr
mosaicplot(
  all_perc_decr,
  col = c(rgb(215, 25, 28, maxColorValue = 255), rgb(166, 217, 106, maxColorValue = 255)),
  main = "Total Win vs Loss Percentage for each team (2012-2016) [Descending Order]",
  cex.axis = 1.2
)

# Score against team, with one lattice panel per venue.
library(lattice)
barchart(AFL_clean$Score ~ AFL_clean$Team | AFL_clean$Venue)
#' Summarise each team's win/loss record at a single venue.
#'
#' @param venue Venue name; rows whose `Venue` equals this value are kept.
#' @param data Match-level data frame with `Venue`, `Team` and `WinLoss`
#'   columns, where `WinLoss` is coded "0" (loss) / "1" (win). Defaults to the
#'   script-level `AFL_clean`, so existing one-argument calls are unchanged.
#' @return A data frame with one row per team and the columns `0` and `1`
#'   (loss and win counts), `team_proportion` (the team's share of all
#'   team-games at the venue), `Loss Percentage` and `Win Percentage` (both as
#'   fractions), sorted by descending win percentage. A team with no games at
#'   the venue has 0/0 = NaN percentages and sorts last.
venueFilter <- function(venue, data = AFL_clean) {
  # which() drops rows whose Venue is NA instead of keeping them as all-NA rows.
  at_venue <- data[which(data$Venue == venue), ]
  if (nrow(at_venue) == 0) {
    warning("No games found for venue '", venue, "'", call. = FALSE)
  }

  # Pin the outcome levels so both the `0` and `1` columns always exist, even
  # when every game at the venue ended the same way.
  outcome <- factor(at_venue$WinLoss, levels = c("0", "1"))
  counts <- as.data.frame.matrix(table(at_venue$Team, outcome))

  # First ordering (most wins first) is kept because it decides how ties in
  # win percentage are broken by the final ordering below.
  counts <- counts[order(counts$`1`, decreasing = TRUE), ]

  games_played <- counts$`0` + counts$`1`
  win_share <- counts$`1` / games_played

  venue_summary <- cbind(
    counts,
    team_proportion = games_played / sum(games_played),
    "Loss Percentage" = 1 - win_share,
    "Win Percentage" = win_share
  )
  venue_summary[order(venue_summary$`Win Percentage`, decreasing = TRUE), ]
}
# Per-venue win/loss summaries for the venues of interest.
subiaco <- venueFilter("Subiaco")
syd.showground <- venueFilter("Sydney Showground")
gabba <- venueFilter("Gabba")
mcg <- venueFilter("M.C.G.")
scg <- venueFilter("S.C.G.")
adl.oval <- venueFilter("Adelaide Oval")
carrara <- venueFilter("Carrara")
docklands <- venueFilter("Docklands")
ftbl.park <- venueFilter("Football Park")

# Total of the loss and win counts over all teams at the Gabba.
sum(gabba[1] + gabba[2])
gabba

library(ggplot2)
# Each team's share of the games at the Gabba: a stacked bar first, then the
# same bar in polar coordinates to give a pie chart.
gabba.tprop <- gabba[3]
bp <- ggplot(gabba.tprop, aes(x = "", y = team_proportion, fill = row.names(gabba.tprop))) +
  geom_bar(width = 1, stat = "identity") +
  scale_fill_manual(values = c(
    "#999999", "#E69F00", "#56B4E9", "#8fc0a0", "#cc9b5b", "#71529b",
    "#7bf68d", "#96270f", "#1e46be", "#b60edc", "#678dbd", "#4eb20b",
    "#5f37e5", "#02339b", "#a7a4df", "#2b681e", "#2c3f7c", "#d9ed8f"
  ))
bp
pie <- bp + coord_polar("y", start = 0)
pie

# Loss and win percentages per team at the Gabba. The title now names the
# Gabba; it previously said "Subiaco Oval" although the Gabba data is plotted.
gabba.wperc <- gabba[4:5]
mosaicplot(
  gabba.wperc,
  col = c(rgb(215 / 255, 25 / 255, 28 / 255), rgb(166 / 255, 217 / 255, 106 / 255)),
  main = "Total Win vs Loss Percentage for each team at the Gabba (2012-2016) [Descending Order]",
  cex.axis = 1.2
)
