The main objective of this project was to analyze and identify key factors that influence a player's rank in the game, based on a dataset of Starcraft player performance metrics.
The analysis was performed under a strict 5-hour deadline.
The tools used were Pandas, NumPy, Matplotlib, Seaborn, scikit-learn, Keras, auto-sklearn, and mord.
The main challenges were handling missing data (particularly for LeagueIndex = 8), determining the best way to treat outliers in features like 'TotalHours' and 'HoursPerWeek', and achieving high accuracy with the models.
The project provided a comprehensive understanding of the key factors influencing a player's rank in Starcraft. The recommendations can help players understand where to focus their efforts for improvement.
The analysis suggests that better feature engineering, or optimizing models against MSE rather than raw accuracy, might yield improved results (a small feature-engineering sketch follows below). There is also potential to explore other advanced modeling techniques or to gather more data to enhance the predictions.
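As a quick illustration of the feature-engineering direction, here is a minimal sketch of two hypothetical derived features; it assumes the cleaned DataFrame filtered_df built later in the notebook, and the derived columns are assumptions of mine rather than part of the original analysis.
# Hedged sketch: hypothetical derived features for a future iteration (not used in this analysis)
df_fe = filtered_df.copy()
df_fe['HoursPerYearOfAge'] = df_fe['TotalHours'] / df_fe['Age']          # hypothetical: play volume relative to age
df_fe['APMPerWeeklyHour'] = df_fe['APM'] / (df_fe['HoursPerWeek'] + 1)   # hypothetical: intensity per weekly hour (+1 avoids division by zero)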
#Importing necessary libraries for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import autosklearn.regression
import autosklearn.metrics
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from mord import LogisticAT
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
df = pd.read_csv("starcraft_player_data.csv")
df.head()
GameID | LeagueIndex | Age | HoursPerWeek | TotalHours | APM | SelectByHotkeys | AssignToHotkeys | UniqueHotkeys | MinimapAttacks | MinimapRightClicks | NumberOfPACs | GapBetweenPACs | ActionLatency | ActionsInPAC | TotalMapExplored | WorkersMade | UniqueUnitsMade | ComplexUnitsMade | ComplexAbilitiesUsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52 | 5 | 27 | 10 | 3000 | 143.7180 | 0.003515 | 0.000220 | 7 | 0.000110 | 0.000392 | 0.004849 | 32.6677 | 40.8673 | 4.7508 | 28 | 0.001397 | 6 | 0.0 | 0.000000 |
1 | 55 | 5 | 23 | 10 | 5000 | 129.2322 | 0.003304 | 0.000259 | 4 | 0.000294 | 0.000432 | 0.004307 | 32.9194 | 42.3454 | 4.8434 | 22 | 0.001193 | 5 | 0.0 | 0.000208 |
2 | 56 | 4 | 30 | 10 | 200 | 69.9612 | 0.001101 | 0.000336 | 4 | 0.000294 | 0.000461 | 0.002926 | 44.6475 | 75.3548 | 4.0430 | 22 | 0.000745 | 6 | 0.0 | 0.000189 |
3 | 57 | 3 | 19 | 20 | 400 | 107.6016 | 0.001034 | 0.000213 | 1 | 0.000053 | 0.000543 | 0.003783 | 29.2203 | 53.7352 | 4.9155 | 19 | 0.000426 | 7 | 0.0 | 0.000384 |
4 | 58 | 3 | 32 | 10 | 500 | 122.8908 | 0.001136 | 0.000327 | 2 | 0.000000 | 0.001329 | 0.002368 | 22.6885 | 62.0813 | 9.3740 | 15 | 0.001174 | 4 | 0.0 | 0.000019 |
# Check the data types of each column
print(df.dtypes)
# Check for missing values
print(df.isnull().sum())
GameID                    int64
LeagueIndex               int64
Age                      object
HoursPerWeek             object
TotalHours               object
APM                     float64
SelectByHotkeys         float64
AssignToHotkeys         float64
UniqueHotkeys             int64
MinimapAttacks          float64
MinimapRightClicks      float64
NumberOfPACs            float64
GapBetweenPACs          float64
ActionLatency           float64
ActionsInPAC            float64
TotalMapExplored          int64
WorkersMade             float64
UniqueUnitsMade           int64
ComplexUnitsMade        float64
ComplexAbilitiesUsed    float64
dtype: object
GameID                  0
LeagueIndex             0
Age                     0
HoursPerWeek            0
TotalHours              0
APM                     0
SelectByHotkeys         0
AssignToHotkeys         0
UniqueHotkeys           0
MinimapAttacks          0
MinimapRightClicks      0
NumberOfPACs            0
GapBetweenPACs          0
ActionLatency           0
ActionsInPAC            0
TotalMapExplored        0
WorkersMade             0
UniqueUnitsMade         0
ComplexUnitsMade        0
ComplexAbilitiesUsed    0
dtype: int64
# Check the DataFrame for duplicate rows
duplicates = df.duplicated()
if duplicates.any():
    print("Duplicates found in the DataFrame.")
    # To see the duplicated rows, you can use:
    # duplicated_rows = df[duplicates]
else:
    print("No duplicates found in the DataFrame.")
No duplicates found in the DataFrame.
# Replace non-numeric values with NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['HoursPerWeek'] = pd.to_numeric(df['HoursPerWeek'], errors='coerce')
df['TotalHours'] = pd.to_numeric(df['TotalHours'], errors='coerce')
# Convert Age, HoursPerWeek, and TotalHours to integer
df['Age'] = df['Age'].astype('Int64')
df['HoursPerWeek'] = df['HoursPerWeek'].astype('Int64')
df['TotalHours'] = df['TotalHours'].astype('Int64')
# Convert TotalMapExplored to float
df['TotalMapExplored'] = df['TotalMapExplored'].astype(float)
# Check the data types of each column
print(df.dtypes)
# Check for missing values
print(df.isnull().sum())
GameID                    int64
LeagueIndex               int64
Age                       Int64
HoursPerWeek              Int64
TotalHours                Int64
APM                     float64
SelectByHotkeys         float64
AssignToHotkeys         float64
UniqueHotkeys             int64
MinimapAttacks          float64
MinimapRightClicks      float64
NumberOfPACs            float64
GapBetweenPACs          float64
ActionLatency           float64
ActionsInPAC            float64
TotalMapExplored        float64
WorkersMade             float64
UniqueUnitsMade           int64
ComplexUnitsMade        float64
ComplexAbilitiesUsed    float64
dtype: object
GameID                   0
LeagueIndex              0
Age                     55
HoursPerWeek            56
TotalHours              57
APM                      0
SelectByHotkeys          0
AssignToHotkeys          0
UniqueHotkeys            0
MinimapAttacks           0
MinimapRightClicks       0
NumberOfPACs             0
GapBetweenPACs           0
ActionLatency            0
ActionsInPAC             0
TotalMapExplored         0
WorkersMade              0
UniqueUnitsMade          0
ComplexUnitsMade         0
ComplexAbilitiesUsed     0
dtype: int64
# Identify which LeagueIndex the null values belong to (are there any LeagueIndex = 8 rows with a non-null Age?)
filtered_df1 = df[(df['LeagueIndex'] == 8) & (df['Age'].notnull())]
print(filtered_df1)
Empty DataFrame
Columns: [GameID, LeagueIndex, Age, HoursPerWeek, TotalHours, APM, SelectByHotkeys, AssignToHotkeys, UniqueHotkeys, MinimapAttacks, MinimapRightClicks, NumberOfPACs, GapBetweenPACs, ActionLatency, ActionsInPAC, TotalMapExplored, WorkersMade, UniqueUnitsMade, ComplexUnitsMade, ComplexAbilitiesUsed]
Index: []
# Find rows with missing data in 'TotalHours' and check which LeagueIndex they belong to
mask = df['TotalHours'].isna()
rows_with_missing_data = df.loc[mask]
# Accessing values of other columns for the selected rows
other_columns_values = rows_with_missing_data[['LeagueIndex', 'APM']]
print(other_columns_values.head())
      LeagueIndex       APM
358             5   94.4724
1841            5  122.2470
3340            8  189.7404
3341            8  287.8128
3342            8  294.0996
Given the limited time, I decided to exclude LeagueIndex = 8 from the analysis: those rows are missing Age, HoursPerWeek, and TotalHours entirely, so there was not enough data to support mean, median, or sampling-based imputation. Regression techniques could have been used to populate the missing values for this class, but time constraints prevented it (a minimal sketch of that approach follows).
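For reference, here is a minimal sketch of the regression-based imputation route mentioned above. It relies on the imports at the top of the notebook, uses only fully observed activity features as predictors, and the choice of RandomForestRegressor and the predictor subset are assumptions; this cell was not part of the actual run.
# Hedged sketch: fill missing TotalHours from fully observed features (not used in this analysis)
predictors = ['APM', 'ActionLatency', 'NumberOfPACs', 'SelectByHotkeys']  # assumed predictor subset with no missing values
df_imp = df.copy()
known = df_imp[df_imp['TotalHours'].notna()]
unknown = df_imp[df_imp['TotalHours'].isna()]
hours_imputer = RandomForestRegressor(n_estimators=100, random_state=42)
hours_imputer.fit(known[predictors], known['TotalHours'].astype(float))
df_imp.loc[unknown.index, 'TotalHours'] = np.round(hours_imputer.predict(unknown[predictors])).astype(int)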
filtered_df = df.dropna(subset=['TotalHours'])
filtered_df
GameID | LeagueIndex | Age | HoursPerWeek | TotalHours | APM | SelectByHotkeys | AssignToHotkeys | UniqueHotkeys | MinimapAttacks | MinimapRightClicks | NumberOfPACs | GapBetweenPACs | ActionLatency | ActionsInPAC | TotalMapExplored | WorkersMade | UniqueUnitsMade | ComplexUnitsMade | ComplexAbilitiesUsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52 | 5 | 27 | 10 | 3000 | 143.7180 | 0.003515 | 0.000220 | 7 | 0.000110 | 0.000392 | 0.004849 | 32.6677 | 40.8673 | 4.7508 | 28.0 | 0.001397 | 6 | 0.0 | 0.000000 |
1 | 55 | 5 | 23 | 10 | 5000 | 129.2322 | 0.003304 | 0.000259 | 4 | 0.000294 | 0.000432 | 0.004307 | 32.9194 | 42.3454 | 4.8434 | 22.0 | 0.001193 | 5 | 0.0 | 0.000208 |
2 | 56 | 4 | 30 | 10 | 200 | 69.9612 | 0.001101 | 0.000336 | 4 | 0.000294 | 0.000461 | 0.002926 | 44.6475 | 75.3548 | 4.0430 | 22.0 | 0.000745 | 6 | 0.0 | 0.000189 |
3 | 57 | 3 | 19 | 20 | 400 | 107.6016 | 0.001034 | 0.000213 | 1 | 0.000053 | 0.000543 | 0.003783 | 29.2203 | 53.7352 | 4.9155 | 19.0 | 0.000426 | 7 | 0.0 | 0.000384 |
4 | 58 | 3 | 32 | 10 | 500 | 122.8908 | 0.001136 | 0.000327 | 2 | 0.000000 | 0.001329 | 0.002368 | 22.6885 | 62.0813 | 9.3740 | 15.0 | 0.001174 | 4 | 0.0 | 0.000019 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3335 | 9261 | 4 | 20 | 8 | 400 | 158.1390 | 0.013829 | 0.000504 | 7 | 0.000217 | 0.000313 | 0.003583 | 36.3990 | 66.2718 | 4.5097 | 30.0 | 0.001035 | 7 | 0.0 | 0.000287 |
3336 | 9264 | 5 | 16 | 56 | 1500 | 186.1320 | 0.006951 | 0.000360 | 6 | 0.000083 | 0.000166 | 0.005414 | 22.8615 | 34.7417 | 4.9309 | 38.0 | 0.001343 | 7 | 0.0 | 0.000388 |
3337 | 9265 | 4 | 21 | 8 | 100 | 121.6992 | 0.002956 | 0.000241 | 8 | 0.000055 | 0.000208 | 0.003690 | 35.5833 | 57.9585 | 5.4154 | 23.0 | 0.002014 | 7 | 0.0 | 0.000000 |
3338 | 9270 | 3 | 20 | 28 | 400 | 134.2848 | 0.005424 | 0.000182 | 5 | 0.000000 | 0.000480 | 0.003205 | 18.2927 | 62.4615 | 6.0202 | 18.0 | 0.000934 | 5 | 0.0 | 0.000000 |
3339 | 9271 | 4 | 22 | 6 | 400 | 88.8246 | 0.000844 | 0.000108 | 2 | 0.000000 | 0.000341 | 0.003099 | 45.1512 | 63.4435 | 5.1913 | 20.0 | 0.000476 | 8 | 0.0 | 0.000054 |
3338 rows × 20 columns
# Finding rows with missing data in column 'TotalHours'
mask = filtered_df['TotalHours'].isna()
rows_with_missing_data = filtered_df.loc[mask]
# Accessing values of other columns for the selected rows
other_columns_values = rows_with_missing_data[['LeagueIndex', 'APM']]
print(other_columns_values.head())
# Check for missing values
print(filtered_df.isnull().sum())
Empty DataFrame
Columns: [LeagueIndex, APM]
Index: []
GameID                  0
LeagueIndex             0
Age                     0
HoursPerWeek            0
TotalHours              0
APM                     0
SelectByHotkeys         0
AssignToHotkeys         0
UniqueHotkeys           0
MinimapAttacks          0
MinimapRightClicks      0
NumberOfPACs            0
GapBetweenPACs          0
ActionLatency           0
ActionsInPAC            0
TotalMapExplored        0
WorkersMade             0
UniqueUnitsMade         0
ComplexUnitsMade        0
ComplexAbilitiesUsed    0
dtype: int64
filtered_df.describe()
GameID | LeagueIndex | Age | HoursPerWeek | TotalHours | APM | SelectByHotkeys | AssignToHotkeys | UniqueHotkeys | MinimapAttacks | MinimapRightClicks | NumberOfPACs | GapBetweenPACs | ActionLatency | ActionsInPAC | TotalMapExplored | WorkersMade | UniqueUnitsMade | ComplexUnitsMade | ComplexAbilitiesUsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3338.000000 | 3338.000000 | 3338.0 | 3338.0 | 3338.0 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 | 3338.000000 |
mean | 4719.552127 | 4.120731 | 21.650389 | 15.909527 | 960.421809 | 114.575763 | 0.004023 | 0.000364 | 4.316357 | 0.000094 | 0.000380 | 0.003433 | 40.713819 | 64.209584 | 5.266955 | 22.116836 | 0.001031 | 6.541043 | 0.000060 | 0.000142 |
std | 2656.919630 | 1.448170 | 4.206357 | 11.964495 | 17318.133922 | 48.111912 | 0.004726 | 0.000210 | 2.333322 | 0.000159 | 0.000359 | 0.000966 | 17.057191 | 19.037394 | 1.500605 | 7.440875 | 0.000520 | 1.859049 | 0.000112 | 0.000266 |
min | 52.000000 | 1.000000 | 16.0 | 0.0 | 3.0 | 22.059600 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000679 | 6.666700 | 24.632600 | 2.038900 | 5.000000 | 0.000077 | 2.000000 | 0.000000 | 0.000000 |
25% | 2423.250000 | 3.000000 | 19.0 | 8.0 | 300.0 | 79.231500 | 0.001245 | 0.000202 | 3.000000 | 0.000000 | 0.000139 | 0.002743 | 29.326600 | 50.886425 | 4.261525 | 17.000000 | 0.000682 | 5.000000 | 0.000000 | 0.000000 |
50% | 4788.000000 | 4.000000 | 21.0 | 12.0 | 500.0 | 107.070300 | 0.002445 | 0.000349 | 4.000000 | 0.000039 | 0.000278 | 0.003376 | 37.058900 | 61.296100 | 5.087050 | 22.000000 | 0.000904 | 6.000000 | 0.000000 | 0.000020 |
75% | 6994.750000 | 5.000000 | 24.0 | 20.0 | 800.0 | 140.156100 | 0.004945 | 0.000493 | 6.000000 | 0.000113 | 0.000508 | 0.004003 | 48.510425 | 74.032525 | 6.027350 | 27.000000 | 0.001258 | 8.000000 | 0.000087 | 0.000182 |
max | 9271.000000 | 7.000000 | 44.0 | 168.0 | 1000000.0 | 389.831400 | 0.043088 | 0.001648 | 10.000000 | 0.003019 | 0.003688 | 0.007971 | 237.142900 | 176.372100 | 18.558100 | 58.000000 | 0.005149 | 13.000000 | 0.000902 | 0.003084 |
# Define the league_labels dictionary
league_labels = {
1: 'Bronze',
2: 'Silver',
3: 'Gold',
4: 'Platinum',
5: 'Diamond',
6: 'Master',
7: 'GrandMaster'
}
# Define the colors for each league
colors = ['darkgoldenrod', 'silver', 'gold', 'cyan', 'blue', 'purple', 'red', 'black']
# Visualize the distribution of the target variable (LeagueIndex)
sns.countplot(x='LeagueIndex', data=filtered_df, palette=colors)
plt.title('Distribution of LeagueIndex')
plt.xticks(ticks=np.arange(7), labels=[league_labels[i] for i in range(1, 8)], rotation=45) # Set custom x-axis labels and rotate them
plt.show()
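To put numbers on the imbalance visible in the countplot, the per-league counts can also be printed directly; this small check is an addition and was not in the original cell.
# Count players per league to quantify the class imbalance shown above
league_counts = filtered_df['LeagueIndex'].value_counts().sort_index()
print(league_counts.rename(index=league_labels))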
def scatter_plot(data_frame, x_column, y_column, x_label, y_label):
    plt.scatter(data_frame[x_column], data_frame[y_column])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
scatter_plot(filtered_df, 'ActionLatency', 'LeagueIndex', 'ActionLatency', 'LeagueIndex')
scatter_plot(filtered_df, 'GapBetweenPACs', 'LeagueIndex', 'GapBetweenPACs', 'LeagueIndex')
scatter_plot(filtered_df, 'SelectByHotkeys', 'LeagueIndex', 'SelectByHotkeys', 'LeagueIndex')
scatter_plot(filtered_df, 'APM', 'LeagueIndex', 'APM', 'LeagueIndex')
The scatter plots above look plausible and require no corrections. Notably, a player with a high ActionLatency (around 180) can still be ranked in Silver, suggesting that game knowledge or build quality can offset slower mechanics relative to Bronze players.
# Scatter plot: TotalHours vs LeagueIndex
scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')
min_hours = filtered_df['TotalHours'].min()
max_hours = filtered_df['TotalHours'].max()
print(min_hours)
print(max_hours)
3
1000000
# Both methods work; I decided to simply filter out the single extreme data point
# # Calculate mean and standard deviation for TotalHours column
# mean = filtered_df['TotalHours'].mean()
# std = filtered_df['TotalHours'].std()
# # Calculate Z-scores manually
# z_scores = (filtered_df['TotalHours'] - mean) / std
# # Define a threshold (e.g., Z-score > 3 or Z-score < -3) to identify outliers
# threshold = 3
# # Filter out rows with TotalHours Z-scores beyond the threshold
# filtered_df = filtered_df[abs(z_scores) <= threshold]
# Remove the data point with TotalHours = 1000000
filtered_df = filtered_df[filtered_df['TotalHours'] != 1000000]
# Scatter plot: TotalHours vs LeagueIndex
scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')
scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')
# Calculate mean and standard deviation for the HoursPerWeek column
mean = filtered_df['HoursPerWeek'].mean()
std = filtered_df['HoursPerWeek'].std()
# Calculate Z-scores manually
z_scores = (filtered_df['HoursPerWeek'] - mean) / std
# Define a threshold (e.g., Z-score > 4 or Z-score < -4) to identify outliers
threshold = 4
# Filter out rows with HoursPerWeek Z-scores beyond the threshold
filtered_df = filtered_df[abs(z_scores) <= threshold]
scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')
It is conceivable for a dedicated player to spend 60 hours a week playing. However, the feature contained several implausible outliers, so a Z-score filter (|z| <= 4) was applied, which in this dataset effectively caps the retained values at 60 hours per week. The scatter plots above support this cutoff.
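As an alternative to the Z-score cut actually used above, an interquartile-range (IQR) rule is another common way to flag the same outliers; the sketch below only prints the bound and is not applied to the data.
# Hedged sketch: IQR-based upper bound for HoursPerWeek (alternative to the Z-score filter above)
q1, q3 = filtered_df['HoursPerWeek'].quantile([0.25, 0.75])
iqr_upper_bound = q3 + 1.5 * (q3 - q1)
print(f"IQR upper bound for HoursPerWeek: {iqr_upper_bound}")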
filtered_df.describe()
GameID | LeagueIndex | Age | HoursPerWeek | TotalHours | APM | SelectByHotkeys | AssignToHotkeys | UniqueHotkeys | MinimapAttacks | MinimapRightClicks | NumberOfPACs | GapBetweenPACs | ActionLatency | ActionsInPAC | TotalMapExplored | WorkersMade | UniqueUnitsMade | ComplexUnitsMade | ComplexAbilitiesUsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3317.000000 | 3317.000000 | 3317.0 | 3317.0 | 3317.0 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 | 3317.000000 |
mean | 4724.613205 | 4.117275 | 21.662044 | 15.47543 | 657.836599 | 114.345070 | 0.003997 | 0.000363 | 4.312632 | 0.000094 | 0.000380 | 0.003430 | 40.751773 | 64.252026 | 5.266317 | 22.102804 | 0.001031 | 6.538137 | 0.000060 | 0.000141 |
std | 2658.554774 | 1.446824 | 4.201145 | 10.438505 | 855.812321 | 47.855465 | 0.004683 | 0.000209 | 2.328354 | 0.000159 | 0.000359 | 0.000963 | 17.086203 | 19.010722 | 1.502072 | 7.423148 | 0.000521 | 1.856299 | 0.000111 | 0.000265 |
min | 52.000000 | 1.000000 | 16.0 | 0.0 | 3.0 | 22.059600 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000679 | 6.666700 | 24.632600 | 2.038900 | 5.000000 | 0.000077 | 2.000000 | 0.000000 | 0.000000 |
25% | 2424.000000 | 3.000000 | 19.0 | 8.0 | 300.0 | 79.128000 | 0.001242 | 0.000202 | 3.000000 | 0.000000 | 0.000139 | 0.002743 | 29.343500 | 50.961200 | 4.261500 | 17.000000 | 0.000681 | 5.000000 | 0.000000 | 0.000000 |
50% | 4801.000000 | 4.000000 | 21.0 | 12.0 | 500.0 | 107.041200 | 0.002428 | 0.000348 | 4.000000 | 0.000039 | 0.000279 | 0.003373 | 37.094300 | 61.321400 | 5.084500 | 22.000000 | 0.000904 | 6.000000 | 0.000000 | 0.000020 |
75% | 6997.000000 | 5.000000 | 24.0 | 20.0 | 800.0 | 139.926600 | 0.004907 | 0.000492 | 6.000000 | 0.000112 | 0.000508 | 0.003998 | 48.564300 | 74.062500 | 6.027500 | 27.000000 | 0.001258 | 8.000000 | 0.000087 | 0.000181 |
max | 9271.000000 | 7.000000 | 44.0 | 60.0 | 25000.0 | 389.831400 | 0.043088 | 0.001648 | 10.000000 | 0.003019 | 0.003688 | 0.007971 | 237.142900 | 176.372100 | 18.558100 | 58.000000 | 0.005149 | 13.000000 | 0.000902 | 0.003084 |
# scatter_plot(filtered_df, 'Age', 'LeagueIndex', 'Age', 'LeagueIndex')
# scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')
# scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')
# scatter_plot(filtered_df, 'APM', 'LeagueIndex', 'APM', 'LeagueIndex')
# scatter_plot(filtered_df, 'SelectByHotkeys', 'LeagueIndex', 'SelectByHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'AssignToHotkeys', 'LeagueIndex', 'AssignToHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'UniqueHotkeys', 'LeagueIndex', 'UniqueHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'MinimapAttacks', 'LeagueIndex', 'MinimapAttacks', 'LeagueIndex')
# scatter_plot(filtered_df, 'MinimapRightClicks', 'LeagueIndex', 'MinimapRightClicks', 'LeagueIndex')
# scatter_plot(filtered_df, 'NumberOfPACs', 'LeagueIndex', 'NumberOfPACs', 'LeagueIndex')
# scatter_plot(filtered_df, 'GapBetweenPACs', 'LeagueIndex', 'GapBetweenPACs', 'LeagueIndex')
# scatter_plot(filtered_df, 'ActionLatency', 'LeagueIndex', 'ActionLatency', 'LeagueIndex')
# scatter_plot(filtered_df, 'ActionsInPAC', 'LeagueIndex', 'ActionsInPAC', 'LeagueIndex')
# scatter_plot(filtered_df, 'TotalMapExplored', 'LeagueIndex', 'TotalMapExplored', 'LeagueIndex')
# scatter_plot(filtered_df, 'WorkersMade', 'LeagueIndex', 'WorkersMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'UniqueUnitsMade', 'LeagueIndex', 'UniqueUnitsMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'ComplexUnitsMade', 'LeagueIndex', 'ComplexUnitsMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'ComplexAbilitiesUsed', 'LeagueIndex', 'ComplexAbilitiesUsed', 'LeagueIndex')
The remaining feature-versus-rank scatter plots (commented out above) show no further anomalies.
def plot_grouped_data(data_frame, group_column, value_column, labels, colors, title, x_label, y_label):
    # Group the data and calculate the mean
    grouped_data = data_frame.groupby(group_column)[value_column].mean()
    # Plot the grouped data with custom colors
    plt.bar(grouped_data.index, grouped_data, color=colors)
    # Add custom labels to the x-axis ticks
    plt.xticks(list(labels.keys()), list(labels.values()), rotation=45)
    # Set the x-axis label, y-axis label, and plot title
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
# Call the plot_grouped_data function for APM
plot_grouped_data(filtered_df, 'LeagueIndex', 'APM', league_labels, colors, 'APM by League Index', 'League Index', 'APM')
#Call plot grouped data function for hours per week
plot_grouped_data(filtered_df, 'LeagueIndex', 'HoursPerWeek', league_labels, colors, 'Average Hours Per Week by League Index', 'League Index', 'Average Hours Per Week')
#Call plot grouped data function for total hours
plot_grouped_data(filtered_df, 'LeagueIndex', 'TotalHours', league_labels, colors, 'Total Hours by League Index', 'League Index', 'Total Hours')
# Call plot_grouped_data function for each column
columns = ['SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks',
'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored',
'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']
for column in columns:
    plot_grouped_data(filtered_df, 'LeagueIndex', column, league_labels, colors,
                      f'{column} by League Index', 'League Index', column)
After the exploratory data analysis (EDA), it appears that games at the professional level tend to be shorter than those of the rest of the player base. This can be inferred from the difference in complex units made, which are typically associated with late-game strategies. The shorter games in professional play are likely due to refined execution, the ability to capitalize on build-order mistakes, and efficient use of workers and map exploration. To investigate further, gathering average game duration data is recommended.
Additionally, it is recommended to collect the last played ranked game date to ensure data relevance and alignment with the current meta. Analyzing trends among the different races (Protoss, Terran, Zerg) would provide valuable insights. Moreover, breaking down weekly hours based on various training activities, such as watching VODs, playing ranked games, practicing against easy AI for specific techniques (e.g., Elazer's creep spread), playing off-race, campaign, or arcade modes, would offer a comprehensive understanding of players' training habits and skill development. It is worth considering the longer queue times for higher-ranked players, which may lead them to use alternative accounts to find more opponents for practice.
It is worth investigating the underrepresentation of Grandmaster rank and understanding why diamond players have the highest total playtime. Collecting the date of account creation could shed light on the nature of these players (casual or committed) and their commitment to staying up to date. Additionally, considering the players' regions would enhance data relevance and potentially yield more insights.
In addition to APM (Actions Per Minute), capturing EPM (Effective Actions Per Minute) would be valuable. While they may be correlated, EPM focuses on the more meaningful actions a player performs during a game, whereas APM includes all actions, including spamming selection boxes.
Exploring the breakdown of unique camera locations set per game and their frequency of use could provide insights into the impact of this mechanic on rank. Understanding the struggles players face in setting camera locations and how it relates to their performance would be insightful.
Lastly, it would be interesting to include outliers such as maphackers in the data analysis to identify their patterns in EDA. It is expected that they would exhibit lower mechanics but higher ranks, which could provide valuable insights into identifying suspicious behavior.
Overall, expanding the data collection with these suggested variables would provide a more comprehensive understanding of player dynamics, strategies, and factors influencing rank.
I would have appreciated a PAC glossary for clarity during the assessment.
# import ydata_profiling
# report = ydata_profiling.ProfileReport(filtered_df, title="Pandas Profiling Report")
# report
# Explore correlations between features
correlation_matrix = filtered_df.corr()
# Plot correlation matrix heatmap
plt.figure(figsize=(18, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")
plt.title('Correlation Matrix')
plt.show()
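To read the heatmap with the target in mind, the features can also be ranked by their correlation with LeagueIndex; this short addition is a convenience and was not part of the original cell.
# Rank features by absolute correlation with the target, complementing the heatmap above
target_corr = correlation_matrix['LeagueIndex'].drop(['LeagueIndex', 'GameID'])
print(target_corr.sort_values(key=abs, ascending=False))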
# Create a copy of the DataFrame
df_copy = filtered_df.copy()
# Separate the features (X) and target variable (y)
X = df_copy.drop(['LeagueIndex', 'GameID'], axis=1)
y = df_copy['LeagueIndex']
# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# Perform standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Define a hypothetical new player's stats to predict on
new_data = pd.DataFrame({
"Age": [25],
"HoursPerWeek": [12],
"TotalHours": [1500],
"APM": [100],
"SelectByHotkeys": [0.002],
"AssignToHotkeys": [0.001],
"UniqueHotkeys": [3],
"MinimapAttacks": [0.001],
"MinimapRightClicks": [0.001],
"NumberOfPACs": [0.003],
"GapBetweenPACs": [50],
"ActionLatency": [70],
"ActionsInPAC": [4],
"TotalMapExplored": [30],
"WorkersMade": [6],
"UniqueUnitsMade": [7],
"ComplexUnitsMade": [0],
"ComplexAbilitiesUsed": [0.002]
})
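One caveat worth flagging before the models are fit: they are trained on standardized features, so for predictions on new_data to be meaningful the row would need to go through the same imputer and scaler. A minimal sketch of that transformation follows; the original cells below pass new_data through in raw units, so their printed "new player" predictions reflect unscaled inputs.
# Hedged sketch: apply the fitted preprocessing to the hand-crafted row (not used by the cells below)
new_data_scaled = scaler.transform(imputer.transform(new_data[X.columns]))
Passing new_data_scaled instead of new_data to evaluate_model would keep the new-player prediction consistent with the training pipeline.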
def evaluate_model(model, X_test, y_test, new_data):
    # Make predictions on the testing set
    y_pred = model.predict(X_test)
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    print("MSE:", mse)
    # Evaluate the model
    print(classification_report(y_test, y_pred, zero_division=1))
    # Predict the target variable for the new data
    predictions = model.predict(new_data)
    # Print the predictions
    print("Predictions:", predictions)
    return accuracy, mse, predictions
# Create and train the logistic regression model
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = model_lr.predict(X_test)
# Get the feature names
feature_names = X.columns
# Retrieve the feature coefficients (coef_[0] reflects only the first class of the multiclass model)
coefficients = model_lr.coef_[0]
abs_coefficients_lr = np.abs(coefficients)
accuracy_lr, mse_lr, predictions_lr = evaluate_model(model_lr, X_test, y_test, new_data)
Accuracy: 0.4397590361445783
MSE: 1.0587349397590362
              precision    recall  f1-score   support

           1       0.36      0.41      0.38        22
           2       0.38      0.31      0.34        68
           3       0.34      0.30      0.32       100
           4       0.40      0.46      0.43       176
           5       0.47      0.45      0.46       170
           6       0.56      0.62      0.59       119
           7       0.50      0.11      0.18         9

    accuracy                           0.44       664
   macro avg       0.43      0.38      0.39       664
weighted avg       0.44      0.44      0.44       664

Predictions: [7]
# Create the SVM model
svm_model = SVC()
# Fit the model to the training data
svm_model.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = svm_model.predict(X_test)
# Calculate feature importances using permutation importance
perm_importance = permutation_importance(svm_model, X_test, y_test)
# Get feature importances
feature_importance_svm = perm_importance.importances_mean
accuracy_svm, mse_svm, predictions_svm = evaluate_model(svm_model, X_test, y_test, new_data)
Accuracy: 0.4292168674698795
MSE: 0.9939759036144579
              precision    recall  f1-score   support

           1       0.40      0.27      0.32        22
           2       0.36      0.22      0.27        68
           3       0.31      0.31      0.31       100
           4       0.42      0.54      0.47       176
           5       0.44      0.42      0.43       170
           6       0.56      0.56      0.56       119
           7       1.00      0.00      0.00         9

    accuracy                           0.43       664
   macro avg       0.50      0.33      0.34       664
weighted avg       0.43      0.43      0.42       664

Predictions: [5]
# Create and fit the ordinal regression model
model_ord = LogisticAT()
model_ord.fit(X_train, y_train)
# Calculate permutation importances for the ordinal model
perm_importance_ord = permutation_importance(model_ord, X_test, y_test)
feature_importance_ord = perm_importance_ord.importances_mean
accuracy_ordinal, mse_ordinal, predictions_ordinal = evaluate_model(model_ord, X_test, y_test, new_data)
Accuracy: 0.42319277108433734
MSE: 0.9728915662650602
              precision    recall  f1-score   support

           1       0.36      0.18      0.24        22
           2       0.30      0.21      0.24        68
           3       0.30      0.37      0.33       100
           4       0.41      0.49      0.45       176
           5       0.47      0.49      0.48       170
           6       0.61      0.47      0.53       119
           7       0.00      0.00      0.00         9

    accuracy                           0.42       664
   macro avg       0.35      0.32      0.33       664
weighted avg       0.43      0.42      0.42       664

Predictions: [7]
# Train the Gradient Boosting model on the imputed data
model_gradient = GradientBoostingClassifier(loss="log_loss")
model_gradient.fit(X_train, y_train)
# Make predictions on the imputed test data
y_pred = model_gradient.predict(X_test)
# Retrieve the feature importance
feature_importance_gradient = model_gradient.feature_importances_
accuracy_gradient, mse_gradient, predictions_gradient = evaluate_model(model_gradient, X_test, y_test, new_data)
Accuracy: 0.40963855421686746
MSE: 1.1265060240963856
              precision    recall  f1-score   support

           1       0.31      0.41      0.35        22
           2       0.37      0.25      0.30        68
           3       0.33      0.37      0.35       100
           4       0.41      0.42      0.41       176
           5       0.45      0.45      0.45       170
           6       0.50      0.50      0.50       119
           7       0.00      0.00      0.00         9

    accuracy                           0.41       664
   macro avg       0.34      0.34      0.34       664
weighted avg       0.41      0.41      0.41       664

Predictions: [4]
# Initialize the Random Forest regressor
rf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse')
# Fit the model to the training data
rf.fit(X_train, y_train)
# Predict the target variable for the test set
y_pred = rf.predict(X_test)
# Convert regression problem to classification problem
y_pred_class = np.rint(y_pred).astype(int)
# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy_rf)
# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_rf)
# Predict the target variable for the new data
predictions = rf.predict(new_data)
# Print the predictions
print("Predictions:", predictions)
# Round the predictions to the nearest whole number
rounded_prediction = np.round(predictions).astype(int)
# Print the rounded predictions
print("Rounded Prediction:", rounded_prediction)
# Retrieve the feature importance
feature_importance_rf = rf.feature_importances_
Accuracy: 0.42018072289156627
Mean Squared Error: 0.7907871987951807
Predictions: [3.47]
Rounded Prediction: [3]
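Because the regression output is mapped back onto the discrete 1-7 league scale, clipping before rounding guards against out-of-range predictions; the small helper below is an addition of mine, not part of the original cell.
# Hedged sketch: convert continuous league predictions to the valid 1-7 integer range
def to_league(pred):
    return np.clip(np.rint(pred), 1, 7).astype(int)

print("Clipped rounded prediction:", to_league(predictions))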
# Define the neural network architecture
model_neural = Sequential()
model_neural.add(Dense(32, activation="relu", input_shape=(X_train.shape[1],)))
model_neural.add(Dense(16, activation="relu"))
model_neural.add(Dense(8, activation="relu"))
model_neural.add(Dense(1, activation="relu"))  # ReLU output keeps predictions non-negative; a linear activation is the more common choice for regression
# Compile the model
model_neural.compile(loss="mean_squared_error", optimizer="adam")
# Train the model
model_neural.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)
# Convert regression problem to classification problem
y_pred = model_neural.predict(X_test).flatten()  # use the neural network's own test predictions
y_pred_class = np.round(y_pred).astype(int)
# Calculate accuracy
accuracy_neural = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy_neural)
# Evaluate the model
mse_neural = model_neural.evaluate(X_test, y_test)
print("Mean Squared Error:", mse_neural)
# Calculate permutation importances
feature_names_neural = X.columns
importances = {}
for feature in feature_names:
    X_test_permuted = X_test.copy()
    np.random.shuffle(X_test_permuted[:, X.columns.get_loc(feature)])
    y_pred_permuted = model_neural.predict(X_test_permuted)
    permuted_mse = mean_squared_error(y_test, y_pred_permuted)
    importances[feature] = np.abs(mse_neural - permuted_mse)
feature_importance_neural = np.array(list(importances.values()))
# Predict the target variable for the new data
predictions = model_neural.predict(new_data)
# Print the predictions
print("Predictions:", predictions)
Epoch 1/100
83/83 [==============================] - 0s 1ms/step - loss: 6.4874
Epoch 2/100
83/83 [==============================] - 0s 700us/step - loss: 2.4442
(epochs 3-99 omitted; training loss decreases steadily from 1.8583 to 0.5749)
Epoch 100/100
83/83 [==============================] - 0s 697us/step - loss: 0.5749
Accuracy: 0.42018072289156627
21/21 [==============================] - 0s 682us/step - loss: 0.9148
Mean Squared Error: 0.9148346185684204
(per-feature permutation predictions omitted: 18 runs of 21/21 steps each)
1/1 [==============================] - 0s 32ms/step
Predictions: [[198.88232]]
# Create a list of model names
models = ['Logistic Regression', 'SVM', 'Ordinal Regression', 'Random Forest','Gradient Boost', 'Neural Network']
# Define colors for the bars
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']
# Create a list of accuracy values
accuracy_values = [accuracy_lr, accuracy_svm, accuracy_ordinal, accuracy_rf, accuracy_gradient, accuracy_neural]
# Plot the accuracy values
plt.figure(figsize=(10, 8))
plt.bar(models, accuracy_values, color=colors)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Models')
# Rotate the x-axis labels
plt.xticks(rotation=45)
plt.show()
# Create a list of MSE values
mse_values = [mse_lr, mse_svm, mse_ordinal, mse_rf, mse_gradient, mse_neural]
print(mse_values)
# Plot the MSE values
plt.figure(figsize=(10, 8))
plt.bar(models, mse_values, color=colors )
plt.xlabel('Models')
plt.ylabel('MSE')
plt.title('Mean Squared Error of Different Models')
# Rotate the x-axis labels
plt.xticks(rotation=45)
plt.show()
[1.0587349397590362, 0.9939759036144579, 0.9728915662650602, 0.7907871987951807, 1.1265060240963856, 0.9148346185684204]
def plot_feature_importance(feature_names, importance_values, title):
    # Sort feature importances in descending order
    sorted_indices = np.argsort(importance_values)[::-1]
    sorted_importance = importance_values[sorted_indices]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]
    # Plot the feature importances
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(sorted_importance)), sorted_importance)
    plt.yticks(range(len(sorted_importance)), sorted_feature_names)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title(title)
    plt.show()
# Plotting Feature importances of each model
plot_feature_importance(feature_names, abs_coefficients_lr, 'Feature Importance - Logistic Regression')
plot_feature_importance(feature_names, feature_importance_gradient, 'Feature Importance - Gradient Boosting')
plot_feature_importance(feature_names, feature_importance_neural, 'Absolute Feature Importance - Neural Network')
plot_feature_importance(feature_names, feature_importance_ord, 'Permutation Importance - Ordinal Regression')
plot_feature_importance(feature_names, feature_importance_svm, "Permutation Importance - SVM")
plot_feature_importance(feature_names, feature_importance_rf, 'Feature Importance - Random Forest')
# Sort the MSE values and the corresponding models
sorted_indices_mse = np.argsort(mse_values)  # ascending: smallest MSE first
mse_values = np.array(mse_values)[sorted_indices_mse].tolist()
models = np.array(models)[sorted_indices_mse].tolist()
# Sort the feature importance and the corresponding feature names
sorted_indices_rf = np.argsort(feature_importance_rf)[::-1] # from largest to smallest
feature_importance_rf = np.array(feature_importance_rf)[sorted_indices_rf].tolist()
feature_names = np.array(feature_names)[sorted_indices_rf].tolist()
# Create a grid of subplots with adjusted height
fig, axes = plt.subplots(nrows=2, figsize=(7, 8)) # Adjust the height with figsize
# Change the background color of the figure to black
fig.set_facecolor('black')
for ax in axes:
    ax.tick_params(color='white', labelcolor='white')  # Change tick colors to white
    for spine in ax.spines.values():  # Change the spine colors to white
        spine.set_edgecolor('white')
# Plot the sorted MSE values in the first subplot
axes[0].bar(models, mse_values, color=colors)
axes[0].set_xlabel('Models', color='white')
axes[0].set_ylabel('MSE', color='white')
axes[0].set_title('Mean Squared Error of Different Models (Sorted)', color='white')
axes[0].set_xticks(models)
axes[0].set_xticklabels(models, rotation=45, color='white')
# Plot the sorted feature importance for Random Forest in the second subplot
axes[1].bar(feature_names, feature_importance_rf, color='blue')  # bar color chosen to stand out on the black background
axes[1].set_title('Feature Importance - Random Forest (Sorted)', color='white')
axes[1].set_xticks(feature_names)
axes[1].set_xticklabels(feature_names, rotation=65, color='white')
# Adjust the space between the plots
plt.subplots_adjust(hspace=0.5)
# Tight layout
plt.tight_layout()
# Save the combined figure
plt.savefig('sorted_mse_and_rf_importance.png', facecolor='black')
# Show the plots
plt.show()
#Showing the most important features of the best model (RF)
# Sort features by importance
feature_indices = np.argsort(np.abs(feature_importance_rf))[::-1]
sorted_features = np.array(feature_names)[feature_indices]
sorted_importances = np.array(feature_importance_rf)[feature_indices]
# Display the top 5 most important features
top_features = sorted(zip(sorted_features[:5], sorted_importances[:5]), key=lambda x: x[1], reverse=True)
for feature, importance in top_features:
    print(f"Feature: {feature}, Importance: {importance}")
Feature: ActionLatency, Importance: 0.342695211003904
Feature: APM, Importance: 0.17777432065163282
Feature: TotalHours, Importance: 0.08627322869733454
Feature: SelectByHotkeys, Importance: 0.04825817773456356
Feature: MinimapAttacks, Importance: 0.04490580633337749
I would like to share with you some important findings regarding the factors that contribute to a player's rank in the game. These findings will help us understand the key aspects that players should focus on in order to improve their performance.
One crucial factor is ActionLatency, which measures the delay between the start of a Perception-Action Cycle (PAC) and the first action. Lower ActionLatency is associated with higher ranks, so minimizing this delay is essential; with an importance score of 0.343 it is the strongest predictor in the Random Forest model.
Another significant factor is APM (Actions Per Minute), the number of actions performed per minute. Higher APM is linked to higher ranks, suggesting that fast decision-making and swift execution are important. APM has an importance score of 0.178.
TotalHours, with an importance score of 0.086, reflects the total time spent playing the game. Investing more hours builds players' understanding and experience, contributing to higher ranks.
SelectByHotkeys, with an importance score of 0.048, indicates how often units and buildings are selected via hotkeys. Mastering efficient hotkey usage is beneficial for achieving higher ranks.
MinimapAttacks, with an importance score of 0.045, rounds out the top five; it captures attack commands issued through the minimap, so effective minimap usage reflects strong map awareness and multitasking.
Most important: Improve ActionLatency. Practice minimizing the time between PACs and taking the first action; focus on faster decision-making, quicker reactions, and executing actions promptly.
Most important: Increase APM. Work on improving the number of actions performed per minute; practice game mechanics, learn keyboard shortcuts, and enhance multitasking abilities.
Most important: Invest sufficient time. Spend more hours playing the game to enhance overall gameplay knowledge, strategies, and skills; consistency and regular practice are key to improvement.
Master hotkey usage: Focus on effectively utilizing hotkeys to control units and buildings. This can significantly improve gameplay speed and efficiency.
The overall theme is to spend time playing effectively, building the ability to process more information during a game and make decisions faster. By following these recommendations, players can enhance their performance and increase their ranks.
I hope you find this information valuable and can use it to support players in their journey to improve their gameplay.
# Create and train the estimator
estimator_askl = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=1800,
seed=42,
resampling_strategy='cv',
resampling_strategy_arguments={'folds': 3},
n_jobs=2,
metric=autosklearn.metrics.root_mean_squared_error,
)
# Fit auto-sklearn on the preprocessed training data (X_train has already been imputed and scaled)
estimator_askl.fit(X_train, y_train)
[WARNING] [2023-09-20 13:56:21,291:Client-EnsembleBuilder] No runs were available to build an ensemble from
AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>, metric=root_mean_squared_error, n_jobs=2, per_run_time_limit=360, resampling_strategy='cv', resampling_strategy_arguments={'folds': 3}, seed=42, time_left_for_this_task=1800)
# Score the model
prediction = estimator_askl.predict(X_test)
import sklearn.metrics
performance_askl = sklearn.metrics.mean_squared_error(y_test, prediction, squared=False)  # squared=False returns RMSE
print(f"Auto-Sklearn regressor RMSE is {performance_askl}")
# Predict the target variable for the new data
predictions = estimator_askl.predict(new_data)
# Print the predictions
print("Predictions:", predictions)
Auto-Sklearn regressor RMSE is 0.8617140431065979
Predictions: [5.27052076]
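For a quick look at what the AutoML search actually tried, auto-sklearn's built-in summaries can be printed; sprint_statistics() is a standard AutoSklearnRegressor method and leaderboard() is available in recent versions. Both are optional inspection steps I am adding, not part of the original run.
# Optional inspection of the auto-sklearn search (method availability depends on the installed version)
print(estimator_askl.sprint_statistics())
# print(estimator_askl.leaderboard())  # per-model ranking, if supported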
# Calculate permutation feature importance
result = permutation_importance(estimator_askl, X_test, y_test, n_repeats=10, random_state=42)
feature_importances = np.abs(result.importances_mean)
# Print the feature importance values
# Use X.columns so the labels match the original column order of the permutation result
# (feature_names was re-sorted by Random Forest importance earlier)
for feature_name, importance in zip(X.columns, feature_importances):
    print(f'{feature_name}: {importance}')
Age: 0.0035876301027364054
HoursPerWeek: 0.004615131571352038
TotalHours: 0.1255787742889301
APM: 0.038272441001786325
SelectByHotkeys: 0.022865966417715988
AssignToHotkeys: 0.020554058616885907
UniqueHotkeys: 0.004767339455537589
MinimapAttacks: 0.03276774050069888
MinimapRightClicks: 0.001532571877929123
NumberOfPACs: 0.01859383904123859
GapBetweenPACs: 0.02638265819544936
ActionLatency: 0.06316194850846757
ActionsInPAC: 0.0017945804576440239
TotalMapExplored: 0.015422749692436533
WorkersMade: 0.008921585337062998
UniqueUnitsMade: 0.007326678107794027
ComplexUnitsMade: 0.0007467130158171686
ComplexAbilitiesUsed: 0.0036628883606827146
# Create the boxplot
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111)
sort_idx = feature_importances.argsort()[::-1] # Sort indices in descending order
ax.boxplot(result.importances[sort_idx].T, labels=[X.columns[i] for i in sort_idx], vert=False)  # label with the original column order
# Set font size
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontsize(20)
# Adjust the layout
fig.tight_layout()
# Show the plot
plt.show()
The auto-sklearn permutation importances broadly align with the rankings from the other models, although the absolute values differ.