#Importing necessary libraries for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import autosklearn.regression
import autosklearn.metrics
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from mord import LogisticAT
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer


df = pd.read_csv("starcraft_player_data.csv")


df.head()


# Check the data types of each column
print(df.dtypes)

# Check for missing values
print(df.isnull().sum())

GameID                    int64
LeagueIndex               int64
Age                      object
HoursPerWeek             object
TotalHours               object
APM                     float64
SelectByHotkeys         float64
AssignToHotkeys         float64
UniqueHotkeys             int64
MinimapAttacks          float64
MinimapRightClicks      float64
NumberOfPACs            float64
GapBetweenPACs          float64
ActionLatency           float64
ActionsInPAC            float64
TotalMapExplored          int64
WorkersMade             float64
UniqueUnitsMade           int64
ComplexUnitsMade        float64
ComplexAbilitiesUsed    float64
dtype: object
GameID                  0
LeagueIndex             0
Age                     0
HoursPerWeek            0
TotalHours              0
APM                     0
SelectByHotkeys         0
AssignToHotkeys         0
UniqueHotkeys           0
MinimapAttacks          0
MinimapRightClicks      0
NumberOfPACs            0
GapBetweenPACs          0
ActionLatency           0
ActionsInPAC            0
TotalMapExplored        0
WorkersMade             0
UniqueUnitsMade         0
ComplexUnitsMade        0
ComplexAbilitiesUsed    0
dtype: int64


# Assuming your DataFrame is named 'df'
duplicates = df.duplicated()

if duplicates.any():
    print("Duplicates found in the DataFrame.")
    # To see the duplicated rows, you can use:
    # duplicated_rows = df[duplicates]
else:
    print("No duplicates found in the DataFrame.")

No duplicates found in the DataFrame.


# Replace non-numeric values with NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['HoursPerWeek'] = pd.to_numeric(df['HoursPerWeek'], errors='coerce')
df['TotalHours'] = pd.to_numeric(df['TotalHours'], errors='coerce')

# Convert Age, HoursPerWeek, and TotalHours to integer
df['Age'] = df['Age'].astype('Int64')
df['HoursPerWeek'] = df['HoursPerWeek'].astype('Int64')
df['TotalHours'] = df['TotalHours'].astype('Int64')

# Convert TotalMapExplored to float
df['TotalMapExplored'] = df['TotalMapExplored'].astype(float)


# Check the data types of each column
print(df.dtypes)

# Check for missing values
print(df.isnull().sum())

GameID                    int64
LeagueIndex               int64
Age                       Int64
HoursPerWeek              Int64
TotalHours                Int64
APM                     float64
SelectByHotkeys         float64
AssignToHotkeys         float64
UniqueHotkeys             int64
MinimapAttacks          float64
MinimapRightClicks      float64
NumberOfPACs            float64
GapBetweenPACs          float64
ActionLatency           float64
ActionsInPAC            float64
TotalMapExplored        float64
WorkersMade             float64
UniqueUnitsMade           int64
ComplexUnitsMade        float64
ComplexAbilitiesUsed    float64
dtype: object
GameID                   0
LeagueIndex              0
Age                     55
HoursPerWeek            56
TotalHours              57
APM                      0
SelectByHotkeys          0
AssignToHotkeys          0
UniqueHotkeys            0
MinimapAttacks           0
MinimapRightClicks       0
NumberOfPACs             0
GapBetweenPACs           0
ActionLatency            0
ActionsInPAC             0
TotalMapExplored         0
WorkersMade              0
UniqueUnitsMade          0
ComplexUnitsMade         0
ComplexAbilitiesUsed     0
dtype: int64


#identify which LeaugeIndex had the null values
filtered_df1 = df[(df['LeagueIndex'] == 8) & (df['Age'].notnull())]
print(filtered_df1)

Empty DataFrame
Columns: [GameID, LeagueIndex, Age, HoursPerWeek, TotalHours, APM, SelectByHotkeys, AssignToHotkeys, UniqueHotkeys, MinimapAttacks, MinimapRightClicks, NumberOfPACs, GapBetweenPACs, ActionLatency, ActionsInPAC, TotalMapExplored, WorkersMade, UniqueUnitsMade, ComplexUnitsMade, ComplexAbilitiesUsed]
Index: []


# Finding rows with missing data in column 'TotalHours' and that is not leagueindex=8
mask = df['TotalHours'].isna()
rows_with_missing_data = df.loc[mask]

# Accessing values of other columns for the selected rows
other_columns_values = rows_with_missing_data[['LeagueIndex', 'APM']]

print(other_columns_values.head())

      LeagueIndex       APM
358             5   94.4724
1841            5  122.2470
3340            8  189.7404
3341            8  287.8128
3342            8  294.0996


filtered_df = df.dropna(subset=['TotalHours'])

filtered_df


# Finding rows with missing data in column 'TotalHours'
mask = filtered_df['TotalHours'].isna()
rows_with_missing_data = filtered_df.loc[mask]

# Accessing values of other columns for the selected rows
other_columns_values = rows_with_missing_data[['LeagueIndex', 'APM']]

print(other_columns_values.head())

# Check for missing values
print(filtered_df.isnull().sum())

Empty DataFrame
Columns: [LeagueIndex, APM]
Index: []
GameID                  0
LeagueIndex             0
Age                     0
HoursPerWeek            0
TotalHours              0
APM                     0
SelectByHotkeys         0
AssignToHotkeys         0
UniqueHotkeys           0
MinimapAttacks          0
MinimapRightClicks      0
NumberOfPACs            0
GapBetweenPACs          0
ActionLatency           0
ActionsInPAC            0
TotalMapExplored        0
WorkersMade             0
UniqueUnitsMade         0
ComplexUnitsMade        0
ComplexAbilitiesUsed    0
dtype: int64


filtered_df.describe()


# Check for missing values
print(filtered_df.isnull().sum())

GameID                  0
LeagueIndex             0
Age                     0
HoursPerWeek            0
TotalHours              0
APM                     0
SelectByHotkeys         0
AssignToHotkeys         0
UniqueHotkeys           0
MinimapAttacks          0
MinimapRightClicks      0
NumberOfPACs            0
GapBetweenPACs          0
ActionLatency           0
ActionsInPAC            0
TotalMapExplored        0
WorkersMade             0
UniqueUnitsMade         0
ComplexUnitsMade        0
ComplexAbilitiesUsed    0
dtype: int64


# Define the league_labels dictionary
league_labels = {
    1: 'Bronze',
    2: 'Silver',
    3: 'Gold',
    4: 'Platinum',
    5: 'Diamond',
    6: 'Master',
    7: 'GrandMaster'
}

# Define the colors for each league
colors = ['darkgoldenrod', 'silver', 'gold', 'cyan', 'blue', 'purple', 'red', 'black']

# Visualize the distribution of the target variable (LeagueIndex)
sns.countplot(x='LeagueIndex', data=filtered_df, palette=colors)
plt.title('Distribution of LeagueIndex')
plt.xticks(ticks=np.arange(7), labels=[league_labels[i] for i in range(1, 8)], rotation=45)  # Set custom x-axis labels and rotate them
plt.show()


def scatter_plot(data_frame, x_column, y_column, x_label, y_label):
    plt.scatter(data_frame[x_column], data_frame[y_column])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


scatter_plot(filtered_df, 'ActionLatency', 'LeagueIndex', 'ActionLatency', 'LeagueIndex')
scatter_plot(filtered_df, 'GapBetweenPACs', 'LeagueIndex', 'GapBetweenPACs', 'LeagueIndex')
scatter_plot(filtered_df, 'SelectByHotkeys', 'LeagueIndex', 'SelectByHotkeys', 'LeagueIndex')
scatter_plot(filtered_df, 'APM', 'LeagueIndex', 'APM', 'LeagueIndex')


# Scatter plot: TotalHours vs LeagueIndex
scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')


min_hours = filtered_df['TotalHours'].min()
max_hours = filtered_df['TotalHours'].max()
print(min_hours)
print(max_hours)

3
1000000


##Both methods work, but I decided to just filter out the one data point

# # Calculate mean and standard deviation for TotalHours column
# mean = filtered_df['TotalHours'].mean()
# std = filtered_df['TotalHours'].std()

# # Calculate Z-scores manually
# z_scores = (filtered_df['TotalHours'] - mean) / std

# # Define a threshold (e.g., Z-score > 3 or Z-score < -3) to identify outliers
# threshold = 3

# # Filter out rows with TotalHours Z-scores beyond the threshold
# filtered_df = filtered_df[abs(z_scores) <= threshold]

# Remove the data point with TotalHours = 1000000
filtered_df = filtered_df[filtered_df['TotalHours'] != 1000000]


# Scatter plot: TotalHours vs LeagueIndex
scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')


scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')


# Calculate mean and standard deviation for TotalHours column
mean = filtered_df['HoursPerWeek'].mean()
std = filtered_df['HoursPerWeek'].std()

# Calculate Z-scores manually
z_scores = (filtered_df['HoursPerWeek'] - mean) / std

# Define a threshold (e.g., Z-score > 4 or Z-score < -4) to identify outliers
threshold = 4

# Filter out rows with TotalHours Z-scores beyond the threshold
filtered_df = filtered_df[abs(z_scores) <= threshold]

scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')


filtered_df.describe()


# scatter_plot(filtered_df, 'Age', 'LeagueIndex', 'Age', 'LeagueIndex')
# scatter_plot(filtered_df, 'HoursPerWeek', 'LeagueIndex', 'HoursPerWeek', 'LeagueIndex')
# scatter_plot(filtered_df, 'TotalHours', 'LeagueIndex', 'TotalHours', 'LeagueIndex')
# scatter_plot(filtered_df, 'APM', 'LeagueIndex', 'APM', 'LeagueIndex')
# scatter_plot(filtered_df, 'SelectByHotkeys', 'LeagueIndex', 'SelectByHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'AssignToHotkeys', 'LeagueIndex', 'AssignToHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'UniqueHotkeys', 'LeagueIndex', 'UniqueHotkeys', 'LeagueIndex')
# scatter_plot(filtered_df, 'MinimapAttacks', 'LeagueIndex', 'MinimapAttacks', 'LeagueIndex')
# scatter_plot(filtered_df, 'MinimapRightClicks', 'LeagueIndex', 'MinimapRightClicks', 'LeagueIndex')
# scatter_plot(filtered_df, 'NumberOfPACs', 'LeagueIndex', 'NumberOfPACs', 'LeagueIndex')
# scatter_plot(filtered_df, 'GapBetweenPACs', 'LeagueIndex', 'GapBetweenPACs', 'LeagueIndex')
# scatter_plot(filtered_df, 'ActionLatency', 'LeagueIndex', 'ActionLatency', 'LeagueIndex')
# scatter_plot(filtered_df, 'ActionsInPAC', 'LeagueIndex', 'ActionsInPAC', 'LeagueIndex')
# scatter_plot(filtered_df, 'TotalMapExplored', 'LeagueIndex', 'TotalMapExplored', 'LeagueIndex')
# scatter_plot(filtered_df, 'WorkersMade', 'LeagueIndex', 'WorkersMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'UniqueUnitsMade', 'LeagueIndex', 'UniqueUnitsMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'ComplexUnitsMade', 'LeagueIndex', 'ComplexUnitsMade', 'LeagueIndex')
# scatter_plot(filtered_df, 'ComplexAbilitiesUsed', 'LeagueIndex', 'ComplexAbilitiesUsed', 'LeagueIndex')


def plot_grouped_data(data_frame, group_column, value_column, labels, colors, title, x_label, y_label):
    # Group the data and calculate the mean
    grouped_data = data_frame.groupby(group_column)[value_column].mean()

    # Plot the grouped data with custom colors
    plt.bar(grouped_data.index, grouped_data, color=colors)

    # Add custom labels to the x-axis ticks
    plt.xticks(list(labels.keys()), list(labels.values()), rotation=45)

    # Set the x-axis label, y-axis label, and plot title
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

    plt.show()


#Call plot grouped data function for total hours
plot_grouped_data(filtered_df, 'LeagueIndex', 'APM', league_labels, colors, 'APM by League Index', 'League Index', 'APM')


#Call plot grouped data function for hours per week
plot_grouped_data(filtered_df, 'LeagueIndex', 'HoursPerWeek', league_labels, colors, 'Average Hours Per Week by League Index', 'League Index', 'Average Hours Per Week')


#Call plot grouped data function for total hours
plot_grouped_data(filtered_df, 'LeagueIndex', 'TotalHours', league_labels, colors, 'Total Hours by League Index', 'League Index', 'Total Hours')


# Call plot_grouped_data function for each column
columns = ['SelectByHotkeys', 'AssignToHotkeys', 'UniqueHotkeys', 'MinimapAttacks', 'MinimapRightClicks',
           'NumberOfPACs', 'GapBetweenPACs', 'ActionLatency', 'ActionsInPAC', 'TotalMapExplored',
           'WorkersMade', 'UniqueUnitsMade', 'ComplexUnitsMade', 'ComplexAbilitiesUsed']

for column in columns:
    plot_grouped_data(filtered_df, 'LeagueIndex', column, league_labels, colors,
                      f'{column} by League Index', 'League Index', column)


# import ydata_profiling
# report = ydata_profiling.ProfileReport(filtered_df, title="Pandas Profiling Report")
# report


# Explore correlations between features
correlation_matrix = filtered_df.corr()

# Plot correlation matrix heatmap
plt.figure(figsize=(18, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")
plt.title('Correlation Matrix')
plt.show()


# Create a copy of the DataFrame
df_copy = filtered_df

# Separate the features (X) and target variable (y)
X = df_copy.drop(['LeagueIndex', 'GameID'], axis=1)
y = df_copy['LeagueIndex']

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Perform standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Make predictions
new_data = pd.DataFrame({
    "Age": [25],
    "HoursPerWeek": [12],
    "TotalHours": [1500],
    "APM": [100],
    "SelectByHotkeys": [0.002],
    "AssignToHotkeys": [0.001],
    "UniqueHotkeys": [3],
    "MinimapAttacks": [0.001],
    "MinimapRightClicks": [0.001],
    "NumberOfPACs": [0.003],
    "GapBetweenPACs": [50],
    "ActionLatency": [70],
    "ActionsInPAC": [4],
    "TotalMapExplored": [30],
    "WorkersMade": [6],
    "UniqueUnitsMade": [7],
    "ComplexUnitsMade": [0],
    "ComplexAbilitiesUsed": [0.002]
})


def evaluate_model(model, X_test, y_test, new_data):
    # Make predictions on the testing set
    y_pred = model.predict(X_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    print("MSE:", mse)

    # Evaluate the model
    print(classification_report(y_test, y_pred, zero_division=1))

    # Predict the target variable for the new data
    predictions = model.predict(new_data)

    # Print the predictions
    print("Predictions:", predictions)

    return accuracy, mse, predictions


# Create and train the logistic regression model
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model_lr.predict(X_test)


# Get the feature names
feature_names = X.columns

# Retrieve the feature coefficients
coefficients = model_lr.coef_[0]
abs_coefficients_lr = np.abs(coefficients)

accuracy_lr, mse_lr, predictions_lr = evaluate_model(model_lr, X_test, y_test, new_data)

Accuracy: 0.4397590361445783
MSE: 1.0587349397590362
              precision    recall  f1-score   support

           1       0.36      0.41      0.38        22
           2       0.38      0.31      0.34        68
           3       0.34      0.30      0.32       100
           4       0.40      0.46      0.43       176
           5       0.47      0.45      0.46       170
           6       0.56      0.62      0.59       119
           7       0.50      0.11      0.18         9

    accuracy                           0.44       664
   macro avg       0.43      0.38      0.39       664
weighted avg       0.44      0.44      0.44       664

Predictions: [7]


# Create the SVM model
svm_model = SVC()

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_model.predict(X_test)

# Calculate feature importances using permutation importance
perm_importance = permutation_importance(svm_model, X_test, y_test)

# Get feature importances
feature_importance_svm = perm_importance.importances_mean

accuracy_svm, mse_svm, predictions_svm = evaluate_model(svm_model, X_test, y_test, new_data)

Accuracy: 0.4292168674698795
MSE: 0.9939759036144579
              precision    recall  f1-score   support

           1       0.40      0.27      0.32        22
           2       0.36      0.22      0.27        68
           3       0.31      0.31      0.31       100
           4       0.42      0.54      0.47       176
           5       0.44      0.42      0.43       170
           6       0.56      0.56      0.56       119
           7       1.00      0.00      0.00         9

    accuracy                           0.43       664
   macro avg       0.50      0.33      0.34       664
weighted avg       0.43      0.43      0.42       664

Predictions: [5]


# Create and fit the ordinal regression model
model_ord = LogisticAT()
model_ord.fit(X_train, y_train)

# Get feature importances
feature_importance_ord = perm_importance.importances_mean


accuracy_ordinal, mse_ordinal, predictions_ordinal = evaluate_model(model_ord, X_test, y_test, new_data)

Accuracy: 0.42319277108433734
MSE: 0.9728915662650602
              precision    recall  f1-score   support

           1       0.36      0.18      0.24        22
           2       0.30      0.21      0.24        68
           3       0.30      0.37      0.33       100
           4       0.41      0.49      0.45       176
           5       0.47      0.49      0.48       170
           6       0.61      0.47      0.53       119
           7       0.00      0.00      0.00         9

    accuracy                           0.42       664
   macro avg       0.35      0.32      0.33       664
weighted avg       0.43      0.42      0.42       664

Predictions: [7]


# Train the Gradient Boosting model on the imputed data
model_gradient = GradientBoostingClassifier(loss="log_loss")
model_gradient.fit(X_train, y_train)

# Make predictions on the imputed test data
y_pred = model_gradient.predict(X_test)

# Retrieve the feature importance
feature_importance_gradient = model_gradient.feature_importances_

accuracy_gradient, mse_gradient, predictions_gradient = evaluate_model(model_gradient, X_test, y_test, new_data)

Accuracy: 0.40963855421686746
MSE: 1.1265060240963856
              precision    recall  f1-score   support

           1       0.31      0.41      0.35        22
           2       0.37      0.25      0.30        68
           3       0.33      0.37      0.35       100
           4       0.41      0.42      0.41       176
           5       0.45      0.45      0.45       170
           6       0.50      0.50      0.50       119
           7       0.00      0.00      0.00         9

    accuracy                           0.41       664
   macro avg       0.34      0.34      0.34       664
weighted avg       0.41      0.41      0.41       664

Predictions: [4]


# Initialize the Random Forest regressor
rf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse')

# Fit the model to the training data
rf.fit(X_train, y_train)

# Predict the target variable for the test set
y_pred = rf.predict(X_test)

# Convert regression problem to classification problem
y_pred_class = np.rint(y_pred).astype(int)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy_rf)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_rf)

# Predict the target variable for the new data
predictions = rf.predict(new_data)

# Print the predictions
print("Predictions:", predictions)

# Round the predictions to the nearest whole number
rounded_prediction = np.round(predictions).astype(int)

# Print the rounded predictions
print("Rounded Prediction:", rounded_prediction)

# Retrieve the feature importance
feature_importance_rf = rf.feature_importances_

Accuracy: 0.42018072289156627
Mean Squared Error: 0.7907871987951807
Predictions: [3.47]
Rounded Prediction: [3]


# Define the neural network architecture
model_neural = Sequential()
model_neural.add(Dense(32, activation="relu", input_shape=(X_train.shape[1],)))
model_neural.add(Dense(16, activation="relu"))
model_neural.add(Dense(8, activation="relu"))
model_neural.add(Dense(1, activation="relu"))  # Changed activation function to "relu"

# Compile the model
model_neural.compile(loss="mean_squared_error", optimizer="adam")

# Train the model
model_neural.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# Convert regression problem to classification problem
y_pred_class = np.round(y_pred).astype(int)

# Calculate accuracy
accuracy_neural = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy_neural)

# Evaluate the model
mse_neural = model_neural.evaluate(X_test, y_test)
print("Mean Squared Error:", mse_neural)

# Calculate permutation importances
feature_names_neural = X.columns
importances = {}

for feature in feature_names:
    X_test_permuted = X_test.copy()
    np.random.shuffle(X_test_permuted[:, X.columns.get_loc(feature)])
    y_pred_permuted = model_neural.predict(X_test_permuted)
    permuted_mse = mean_squared_error(y_test, y_pred_permuted)
    importances[feature] = np.abs(mse_neural - permuted_mse)

feature_importance_neural = np.array(list(importances.values()))

# Predict the target variable for the new data
predictions = model_neural.predict(new_data)

# Print the predictions
print("Predictions:", predictions)

Epoch 1/100

83/83 [==============================] - 0s 1ms/step - loss: 6.4874
Epoch 2/100
83/83 [==============================] - 0s 700us/step - loss: 2.4442
Epoch 3/100
83/83 [==============================] - 0s 657us/step - loss: 1.8583
Epoch 4/100
83/83 [==============================] - 0s 704us/step - loss: 1.5521
Epoch 5/100
83/83 [==============================] - 0s 716us/step - loss: 1.3632
Epoch 6/100
83/83 [==============================] - 0s 677us/step - loss: 1.2167
Epoch 7/100
83/83 [==============================] - 0s 661us/step - loss: 1.1172
Epoch 8/100
83/83 [==============================] - 0s 651us/step - loss: 1.0432
Epoch 9/100
83/83 [==============================] - 0s 644us/step - loss: 0.9856
Epoch 10/100
83/83 [==============================] - 0s 641us/step - loss: 0.9405
Epoch 11/100
83/83 [==============================] - 0s 646us/step - loss: 0.9095
Epoch 12/100
83/83 [==============================] - 0s 681us/step - loss: 0.8744
Epoch 13/100
83/83 [==============================] - 0s 703us/step - loss: 0.8546
Epoch 14/100
83/83 [==============================] - 0s 697us/step - loss: 0.8377
Epoch 15/100
83/83 [==============================] - 0s 664us/step - loss: 0.8240
Epoch 16/100
83/83 [==============================] - 0s 638us/step - loss: 0.8180
Epoch 17/100
83/83 [==============================] - 0s 645us/step - loss: 0.8008
Epoch 18/100
83/83 [==============================] - 0s 636us/step - loss: 0.7964
Epoch 19/100
83/83 [==============================] - 0s 666us/step - loss: 0.7851
Epoch 20/100
83/83 [==============================] - 0s 733us/step - loss: 0.7816
Epoch 21/100
83/83 [==============================] - 0s 755us/step - loss: 0.7706
Epoch 22/100
83/83 [==============================] - 0s 698us/step - loss: 0.7671
Epoch 23/100
83/83 [==============================] - 0s 680us/step - loss: 0.7581
Epoch 24/100
83/83 [==============================] - 0s 742us/step - loss: 0.7520
Epoch 25/100
83/83 [==============================] - 0s 668us/step - loss: 0.7501
Epoch 26/100
83/83 [==============================] - 0s 664us/step - loss: 0.7419
Epoch 27/100
83/83 [==============================] - 0s 685us/step - loss: 0.7439
Epoch 28/100
83/83 [==============================] - 0s 696us/step - loss: 0.7335
Epoch 29/100
83/83 [==============================] - 0s 663us/step - loss: 0.7295
Epoch 30/100
83/83 [==============================] - 0s 649us/step - loss: 0.7265
Epoch 31/100
83/83 [==============================] - 0s 666us/step - loss: 0.7252
Epoch 32/100
83/83 [==============================] - 0s 850us/step - loss: 0.7158
Epoch 33/100
83/83 [==============================] - 0s 835us/step - loss: 0.7126
Epoch 34/100
83/83 [==============================] - 0s 727us/step - loss: 0.7142
Epoch 35/100
83/83 [==============================] - 0s 696us/step - loss: 0.7148
Epoch 36/100
83/83 [==============================] - 0s 665us/step - loss: 0.7085
Epoch 37/100
83/83 [==============================] - 0s 652us/step - loss: 0.7050
Epoch 38/100
83/83 [==============================] - 0s 657us/step - loss: 0.7070
Epoch 39/100
83/83 [==============================] - 0s 661us/step - loss: 0.7111
Epoch 40/100
83/83 [==============================] - 0s 680us/step - loss: 0.6934
Epoch 41/100
83/83 [==============================] - 0s 920us/step - loss: 0.7004
Epoch 42/100
83/83 [==============================] - 0s 695us/step - loss: 0.6964
Epoch 43/100
83/83 [==============================] - 0s 668us/step - loss: 0.6914
Epoch 44/100
83/83 [==============================] - 0s 668us/step - loss: 0.6845
Epoch 45/100
83/83 [==============================] - 0s 649us/step - loss: 0.6837
Epoch 46/100
83/83 [==============================] - 0s 693us/step - loss: 0.6782
Epoch 47/100
83/83 [==============================] - 0s 706us/step - loss: 0.6753
Epoch 48/100
83/83 [==============================] - 0s 722us/step - loss: 0.6733
Epoch 49/100
83/83 [==============================] - 0s 630us/step - loss: 0.6723
Epoch 50/100
83/83 [==============================] - 0s 692us/step - loss: 0.6702
Epoch 51/100
83/83 [==============================] - 0s 651us/step - loss: 0.6748
Epoch 52/100
83/83 [==============================] - 0s 640us/step - loss: 0.6686
Epoch 53/100
83/83 [==============================] - 0s 640us/step - loss: 0.6615
Epoch 54/100
83/83 [==============================] - 0s 645us/step - loss: 0.6606
Epoch 55/100
83/83 [==============================] - 0s 638us/step - loss: 0.6655
Epoch 56/100
83/83 [==============================] - 0s 668us/step - loss: 0.6630
Epoch 57/100
83/83 [==============================] - 0s 698us/step - loss: 0.6545
Epoch 58/100
83/83 [==============================] - 0s 707us/step - loss: 0.6567
Epoch 59/100
83/83 [==============================] - 0s 663us/step - loss: 0.6544
Epoch 60/100
83/83 [==============================] - 0s 651us/step - loss: 0.6507
Epoch 61/100
83/83 [==============================] - 0s 757us/step - loss: 0.6482
Epoch 62/100
83/83 [==============================] - 0s 635us/step - loss: 0.6479
Epoch 63/100
83/83 [==============================] - 0s 631us/step - loss: 0.6452
Epoch 64/100
83/83 [==============================] - 0s 610us/step - loss: 0.6406
Epoch 65/100
83/83 [==============================] - 0s 631us/step - loss: 0.6354
Epoch 66/100
83/83 [==============================] - 0s 647us/step - loss: 0.6371
Epoch 67/100
83/83 [==============================] - 0s 679us/step - loss: 0.6402
Epoch 68/100
83/83 [==============================] - 0s 703us/step - loss: 0.6351
Epoch 69/100
83/83 [==============================] - 0s 675us/step - loss: 0.6257
Epoch 70/100
83/83 [==============================] - 0s 693us/step - loss: 0.6321
Epoch 71/100
83/83 [==============================] - 0s 698us/step - loss: 0.6238
Epoch 72/100
83/83 [==============================] - 0s 658us/step - loss: 0.6261
Epoch 73/100
83/83 [==============================] - 0s 659us/step - loss: 0.6201
Epoch 74/100
83/83 [==============================] - 0s 647us/step - loss: 0.6339
Epoch 75/100
83/83 [==============================] - 0s 647us/step - loss: 0.6163
Epoch 76/100
83/83 [==============================] - 0s 644us/step - loss: 0.6166
Epoch 77/100
83/83 [==============================] - 0s 786us/step - loss: 0.6211
Epoch 78/100
83/83 [==============================] - 0s 728us/step - loss: 0.6176
Epoch 79/100
83/83 [==============================] - 0s 736us/step - loss: 0.6067
Epoch 80/100
83/83 [==============================] - 0s 734us/step - loss: 0.6146
Epoch 81/100
83/83 [==============================] - 0s 662us/step - loss: 0.6052
Epoch 82/100
83/83 [==============================] - 0s 789us/step - loss: 0.6036
Epoch 83/100
83/83 [==============================] - 0s 694us/step - loss: 0.6033
Epoch 84/100
83/83 [==============================] - 0s 664us/step - loss: 0.6075
Epoch 85/100
83/83 [==============================] - 0s 681us/step - loss: 0.6000
Epoch 86/100
83/83 [==============================] - 0s 703us/step - loss: 0.6005
Epoch 87/100
83/83 [==============================] - 0s 684us/step - loss: 0.5906
Epoch 88/100
83/83 [==============================] - 0s 697us/step - loss: 0.5925
Epoch 89/100
83/83 [==============================] - 0s 673us/step - loss: 0.5956
Epoch 90/100
83/83 [==============================] - 0s 692us/step - loss: 0.5870
Epoch 91/100
83/83 [==============================] - 0s 695us/step - loss: 0.5880
Epoch 92/100
83/83 [==============================] - 0s 744us/step - loss: 0.5841
Epoch 93/100
83/83 [==============================] - 0s 872us/step - loss: 0.5787
Epoch 94/100
83/83 [==============================] - 0s 699us/step - loss: 0.5812
Epoch 95/100
83/83 [==============================] - 0s 668us/step - loss: 0.5783
Epoch 96/100
83/83 [==============================] - 0s 663us/step - loss: 0.5747
Epoch 97/100
83/83 [==============================] - 0s 657us/step - loss: 0.5715
Epoch 98/100
83/83 [==============================] - 0s 677us/step - loss: 0.5724
Epoch 99/100
83/83 [==============================] - 0s 800us/step - loss: 0.5749
Epoch 100/100
83/83 [==============================] - 0s 697us/step - loss: 0.5749
Accuracy: 0.42018072289156627
21/21 [==============================] - 0s 682us/step - loss: 0.9148
Mean Squared Error: 0.9148346185684204
21/21 [==============================] - 0s 570us/step
21/21 [==============================] - 0s 551us/step
21/21 [==============================] - 0s 562us/step
21/21 [==============================] - 0s 522us/step
21/21 [==============================] - 0s 531us/step
21/21 [==============================] - 0s 530us/step
21/21 [==============================] - 0s 622us/step
21/21 [==============================] - 0s 535us/step
21/21 [==============================] - 0s 521us/step
21/21 [==============================] - 0s 512us/step
21/21 [==============================] - 0s 524us/step
21/21 [==============================] - 0s 493us/step
21/21 [==============================] - 0s 476us/step
21/21 [==============================] - 0s 479us/step
21/21 [==============================] - 0s 533us/step
21/21 [==============================] - 0s 493us/step
21/21 [==============================] - 0s 524us/step
21/21 [==============================] - 0s 518us/step
1/1 [==============================] - 0s 32ms/step
Predictions: [[198.88232]]


# Create a list of model names
models = ['Logistic Regression', 'SVM', 'Ordinal Regression', 'Random Forest','Gradient Boost', 'Neural Network']

# Define colors for the bars
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']


# Create a list of accuracy values
accuracy_values = [accuracy_lr, accuracy_svm, accuracy_ordinal, accuracy_rf, accuracy_gradient, accuracy_neural]

# Plot the accuracy values
plt.figure(figsize=(10, 8))
plt.bar(models, accuracy_values, color=colors)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Models')


# Rotate the x-axis labels
plt.xticks(rotation=45)
plt.show()


# Create a list of MSE values
mse_values = [mse_lr, mse_svm, mse_ordinal, mse_rf, mse_gradient, mse_neural]

print(mse_values)

# Plot the MSE values
plt.figure(figsize=(10, 8))
plt.bar(models, mse_values, color=colors )
plt.xlabel('Models')
plt.ylabel('MSE')
plt.title('Mean Squared Error of Different Models')


# Rotate the x-axis labels
plt.xticks(rotation=45)
plt.show()

[1.0587349397590362, 0.9939759036144579, 0.9728915662650602, 0.7907871987951807, 1.1265060240963856, 0.9148346185684204]


def plot_feature_importance(feature_names, importance_values, title):
    # Sort feature importances in descending order
    sorted_indices = np.argsort(importance_values)[::-1]
    sorted_importance = importance_values[sorted_indices]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]

    # Plot the feature importances
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(sorted_importance)), sorted_importance)
    plt.yticks(range(len(sorted_importance)), sorted_feature_names)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title(title)
    plt.show()


# Plotting Feature importances of each model
plot_feature_importance(feature_names, abs_coefficients_lr, 'Feature Importance - Logistic Regression')
plot_feature_importance(feature_names, feature_importance_gradient, 'Feature Importance - Gradient Boosting')
plot_feature_importance(feature_names, feature_importance_neural, 'Absolute Feature Importance - Neural Network')
plot_feature_importance(feature_names, feature_importance_ord, 'Permutation Importance - Ordinal Regression')
plot_feature_importance(feature_names, feature_importance_svm, "Permutation Importance - SVM")
plot_feature_importance(feature_names, feature_importance_rf, 'Feature Importance - Random Forest')


# Sort the MSE values and the corresponding models
sorted_indices_mse = np.argsort(mse_values)[::1]  # from smallest to largest
mse_values = np.array(mse_values)[sorted_indices_mse].tolist()
models = np.array(models)[sorted_indices_mse].tolist()

# Sort the feature importance and the corresponding feature names
sorted_indices_rf = np.argsort(feature_importance_rf)[::-1]  # from largest to smallest
feature_importance_rf = np.array(feature_importance_rf)[sorted_indices_rf].tolist()
feature_names = np.array(feature_names)[sorted_indices_rf].tolist()

# Create a grid of subplots with adjusted height
fig, axes = plt.subplots(nrows=2, figsize=(7, 8))  # Adjust the height with figsize

# Change the background color of the figure to black
fig.set_facecolor('black')
for ax in axes:
    ax.tick_params(color='white', labelcolor='white')  # Change tick colors to white
    for spine in ax.spines.values():  # Change the spine colors to white
        spine.set_edgecolor('white')

# Plot the sorted MSE values in the first subplot
axes[0].bar(models, mse_values, color=colors)
axes[0].set_xlabel('Models', color='white')
axes[0].set_ylabel('MSE', color='white')
axes[0].set_title('Mean Squared Error of Different Models (Sorted)', color='white')
axes[0].set_xticks(models)
axes[0].set_xticklabels(models, rotation=45, color='white')

# Plot the sorted feature importance for Random Forest in the second subplot
axes[1].bar(feature_names, feature_importance_rf, color='blue')  # Adjust bar color to white or other suitable color
axes[1].set_title('Feature Importance - Random Forest (Sorted)', color='white')
axes[1].set_xticks(feature_names)
axes[1].set_xticklabels(feature_names, rotation=65, color='white')

# Adjust the space between the plots
plt.subplots_adjust(hspace=0.5)

# Tight layout
plt.tight_layout()

# Save the combined figure
plt.savefig('sorted_mse_and_rf_importance.png', facecolor='black')

# Show the plots
plt.show()


#Showing the most important features of the best model (RF)
# Sort features by importance
feature_indices = np.argsort(np.abs(feature_importance_rf))[::-1]
sorted_features = feature_names[feature_indices]
sorted_importances = feature_importance_rf[feature_indices]

# Display the top 5 most important features
top_features = sorted(zip(sorted_features[:5], sorted_importances[:5]), key=lambda x: x[1], reverse=True)
for feature, importance in top_features:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: ActionLatency, Importance: 0.342695211003904
Feature: APM, Importance: 0.17777432065163282
Feature: TotalHours, Importance: 0.08627322869733454
Feature: SelectByHotkeys, Importance: 0.04825817773456356
Feature: MinimapAttacks, Importance: 0.04490580633337749


# Create and train the estimator 
estimator_askl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=1800,
    seed=42, 
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 3},
    n_jobs=2,
    metric=autosklearn.metrics.root_mean_squared_error,
)
# Auto-sklearn ingests the pandas dataframe and detects column types
estimator_askl.fit(X_train, y_train)

[WARNING] [2023-09-20 13:56:21,291:Client-EnsembleBuilder] No runs were available to build an ensemble from

AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     metric=root_mean_squared_error, n_jobs=2,
                     per_run_time_limit=360, resampling_strategy='cv',
                     resampling_strategy_arguments={'folds': 3}, seed=42,
                     time_left_for_this_task=1800)

AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     metric=root_mean_squared_error, n_jobs=2,
                     per_run_time_limit=360, resampling_strategy='cv',
                     resampling_strategy_arguments={'folds': 3}, seed=42,
                     time_left_for_this_task=1800)


# Score the model
prediction = estimator_askl.predict(X_test)

import sklearn.metrics
performance_askl = sklearn.metrics.mean_squared_error(y_test, prediction, squared=False)
print(f"Auto-Sklearn Classifier MSE performance is {performance_askl}")

# Predict the target variable for the new data
predictions = estimator_askl.predict(new_data)

# Print the predictions
print("Predictions:", predictions)

Auto-Sklearn Classifier MSE performance is 0.8617140431065979

Predictions: [5.27052076]


# Calculate permutation feature importance
result = permutation_importance(estimator_askl, X_test, y_test, n_repeats=10, random_state=42)
feature_importances = np.abs(result.importances_mean)

# Print the feature importance values
for feature_name, importance in zip(feature_names, feature_importances):
    print(f'{feature_name}: {importance}')

Age: 0.0035876301027364054
HoursPerWeek: 0.004615131571352038
TotalHours: 0.1255787742889301
APM: 0.038272441001786325
SelectByHotkeys: 0.022865966417715988
AssignToHotkeys: 0.020554058616885907
UniqueHotkeys: 0.004767339455537589
MinimapAttacks: 0.03276774050069888
MinimapRightClicks: 0.001532571877929123
NumberOfPACs: 0.01859383904123859
GapBetweenPACs: 0.02638265819544936
ActionLatency: 0.06316194850846757
ActionsInPAC: 0.0017945804576440239
TotalMapExplored: 0.015422749692436533
WorkersMade: 0.008921585337062998
UniqueUnitsMade: 0.007326678107794027
ComplexUnitsMade: 0.0007467130158171686
ComplexAbilitiesUsed: 0.0036628883606827146


# Create the boxplot
fig = plt.figure(figsize=(16, 9))
ax = fig.add_subplot(111)
sort_idx = feature_importances.argsort()[::-1]  # Sort indices in descending order
ax.boxplot(result.importances[sort_idx].T, labels=[feature_names[i] for i in sort_idx], vert=False)

# Set font size
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontsize(20)

# Adjust the layout
fig.tight_layout()

# Show the plot
plt.show()

	GameID	LeagueIndex	Age	HoursPerWeek	TotalHours	APM	SelectByHotkeys	AssignToHotkeys	UniqueHotkeys	MinimapAttacks	MinimapRightClicks	NumberOfPACs	GapBetweenPACs	ActionLatency	ActionsInPAC	TotalMapExplored	WorkersMade	UniqueUnitsMade	ComplexAbilitiesUsed
0	52	5	27	10	3000	143.7180	0.003515	0.000220	7	0.000110	0.000392	0.004849	32.6677	40.8673	4.7508	28	0.001397	6	0.000000
1	55	5	23	10	5000	129.2322	0.003304	0.000259	4	0.000294	0.000432	0.004307	32.9194	42.3454	4.8434	22	0.001193	5	0.000208
2	56	4	30	10	200	69.9612	0.001101	0.000336	4	0.000294	0.000461	0.002926	44.6475	75.3548	4.0430	22	0.000745	6	0.000189
3	57	3	19	20	400	107.6016	0.001034	0.000213	1	0.000053	0.000543	0.003783	29.2203	53.7352	4.9155	19	0.000426	7	0.000384
4	58	3	32	10	500	122.8908	0.001136	0.000327	2	0.000000	0.001329	0.002368	22.6885	62.0813	9.3740	15	0.001174	4	0.000019

	GameID	LeagueIndex	Age	HoursPerWeek	TotalHours	APM	SelectByHotkeys	AssignToHotkeys	UniqueHotkeys	MinimapAttacks	MinimapRightClicks	NumberOfPACs	GapBetweenPACs	ActionLatency	ActionsInPAC	TotalMapExplored	WorkersMade	UniqueUnitsMade	ComplexUnitsMade	ComplexAbilitiesUsed
0	52	5	27	10	3000	143.7180	0.003515	0.000220	7	0.000110	0.000392	0.004849	32.6677	40.8673	4.7508	28.0	0.001397	6	0.0	0.000000
1	55	5	23	10	5000	129.2322	0.003304	0.000259	4	0.000294	0.000432	0.004307	32.9194	42.3454	4.8434	22.0	0.001193	5	0.0	0.000208
2	56	4	30	10	200	69.9612	0.001101	0.000336	4	0.000294	0.000461	0.002926	44.6475	75.3548	4.0430	22.0	0.000745	6	0.0	0.000189
3	57	3	19	20	400	107.6016	0.001034	0.000213	1	0.000053	0.000543	0.003783	29.2203	53.7352	4.9155	19.0	0.000426	7	0.0	0.000384
4	58	3	32	10	500	122.8908	0.001136	0.000327	2	0.000000	0.001329	0.002368	22.6885	62.0813	9.3740	15.0	0.001174	4	0.0	0.000019
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3335	9261	4	20	8	400	158.1390	0.013829	0.000504	7	0.000217	0.000313	0.003583	36.3990	66.2718	4.5097	30.0	0.001035	7	0.0	0.000287
3336	9264	5	16	56	1500	186.1320	0.006951	0.000360	6	0.000083	0.000166	0.005414	22.8615	34.7417	4.9309	38.0	0.001343	7	0.0	0.000388
3337	9265	4	21	8	100	121.6992	0.002956	0.000241	8	0.000055	0.000208	0.003690	35.5833	57.9585	5.4154	23.0	0.002014	7	0.0	0.000000
3338	9270	3	20	28	400	134.2848	0.005424	0.000182	5	0.000000	0.000480	0.003205	18.2927	62.4615	6.0202	18.0	0.000934	5	0.0	0.000000
3339	9271	4	22	6	400	88.8246	0.000844	0.000108	2	0.000000	0.000341	0.003099	45.1512	63.4435	5.1913	20.0	0.000476	8	0.0	0.000054

	GameID	LeagueIndex	Age	HoursPerWeek	TotalHours	APM	SelectByHotkeys	AssignToHotkeys	UniqueHotkeys	MinimapAttacks	MinimapRightClicks	NumberOfPACs	GapBetweenPACs	ActionLatency	ActionsInPAC	TotalMapExplored	WorkersMade	UniqueUnitsMade	ComplexUnitsMade	ComplexAbilitiesUsed
count	3338.000000	3338.000000	3338.0	3338.0	3338.0	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000	3338.000000
mean	4719.552127	4.120731	21.650389	15.909527	960.421809	114.575763	0.004023	0.000364	4.316357	0.000094	0.000380	0.003433	40.713819	64.209584	5.266955	22.116836	0.001031	6.541043	0.000060	0.000142
std	2656.919630	1.448170	4.206357	11.964495	17318.133922	48.111912	0.004726	0.000210	2.333322	0.000159	0.000359	0.000966	17.057191	19.037394	1.500605	7.440875	0.000520	1.859049	0.000112	0.000266
min	52.000000	1.000000	16.0	0.0	3.0	22.059600	0.000000	0.000000	0.000000	0.000000	0.000000	0.000679	6.666700	24.632600	2.038900	5.000000	0.000077	2.000000	0.000000	0.000000
25%	2423.250000	3.000000	19.0	8.0	300.0	79.231500	0.001245	0.000202	3.000000	0.000000	0.000139	0.002743	29.326600	50.886425	4.261525	17.000000	0.000682	5.000000	0.000000	0.000000
50%	4788.000000	4.000000	21.0	12.0	500.0	107.070300	0.002445	0.000349	4.000000	0.000039	0.000278	0.003376	37.058900	61.296100	5.087050	22.000000	0.000904	6.000000	0.000000	0.000020
75%	6994.750000	5.000000	24.0	20.0	800.0	140.156100	0.004945	0.000493	6.000000	0.000113	0.000508	0.004003	48.510425	74.032525	6.027350	27.000000	0.001258	8.000000	0.000087	0.000182
max	9271.000000	7.000000	44.0	168.0	1000000.0	389.831400	0.043088	0.001648	10.000000	0.003019	0.003688	0.007971	237.142900	176.372100	18.558100	58.000000	0.005149	13.000000	0.000902	0.003084

	GameID	LeagueIndex	Age	HoursPerWeek	TotalHours	APM	SelectByHotkeys	AssignToHotkeys	UniqueHotkeys	MinimapAttacks	MinimapRightClicks	NumberOfPACs	GapBetweenPACs	ActionLatency	ActionsInPAC	TotalMapExplored	WorkersMade	UniqueUnitsMade	ComplexUnitsMade	ComplexAbilitiesUsed
count	3317.000000	3317.000000	3317.0	3317.0	3317.0	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000	3317.000000
mean	4724.613205	4.117275	21.662044	15.47543	657.836599	114.345070	0.003997	0.000363	4.312632	0.000094	0.000380	0.003430	40.751773	64.252026	5.266317	22.102804	0.001031	6.538137	0.000060	0.000141
std	2658.554774	1.446824	4.201145	10.438505	855.812321	47.855465	0.004683	0.000209	2.328354	0.000159	0.000359	0.000963	17.086203	19.010722	1.502072	7.423148	0.000521	1.856299	0.000111	0.000265
min	52.000000	1.000000	16.0	0.0	3.0	22.059600	0.000000	0.000000	0.000000	0.000000	0.000000	0.000679	6.666700	24.632600	2.038900	5.000000	0.000077	2.000000	0.000000	0.000000
25%	2424.000000	3.000000	19.0	8.0	300.0	79.128000	0.001242	0.000202	3.000000	0.000000	0.000139	0.002743	29.343500	50.961200	4.261500	17.000000	0.000681	5.000000	0.000000	0.000000
50%	4801.000000	4.000000	21.0	12.0	500.0	107.041200	0.002428	0.000348	4.000000	0.000039	0.000279	0.003373	37.094300	61.321400	5.084500	22.000000	0.000904	6.000000	0.000000	0.000020
75%	6997.000000	5.000000	24.0	20.0	800.0	139.926600	0.004907	0.000492	6.000000	0.000112	0.000508	0.003998	48.564300	74.062500	6.027500	27.000000	0.001258	8.000000	0.000087	0.000181
max	9271.000000	7.000000	44.0	60.0	25000.0	389.831400	0.043088	0.001648	10.000000	0.003019	0.003688	0.007971	237.142900	176.372100	18.558100	58.000000	0.005149	13.000000	0.000902	0.003084

StarCraft Player Performance Analytics Project Overview:¶

Objective:¶

Time Constraint:¶

Key Procedures:¶

Tools & Libraries Used:¶

Challenges:¶

Value Delivered:¶

Further Exploration:¶

Justification:¶

Removing the outlier of 168 hours, as playing every hour for the entire week is impossible.¶

Justification:¶

Hypothetical Scenario: Collecting More Data¶

I have created my own correlation map since the one provided by ydata_profiling has a color palette that is difficult to read. Ydata_profiling assisted me in EDA¶

From above graphs, Action Latency, Total hours, and APM were a majority of the models most important features.¶

Conclusion:¶

Based on these findings, I have some recommendations for players:¶

Outside research, I tried implementing this method to my models: https://towardsdatascience.com/simple-trick-to-train-an-ordinal-regression-with-any-classifier-6911183d2a3c ¶

EXTRA: More resource intesive model (Automated optimization)¶

2nd best performing model but gives a decimal prediction like RF¶

StarCraft Player Performance Analytics Project Overview:¶

Objective:¶

Time Constraint:¶

Key Procedures:¶

Tools & Libraries Used:¶

Challenges:¶

Value Delivered:¶

Further Exploration:¶

Justification:¶

Removing the outlier of 168 hours, as playing every hour for the entire week is impossible.¶

Justification:¶

Hypothetical Scenario: Collecting More Data¶

I have created my own correlation map since the one provided by ydata_profiling has a color palette that is difficult to read. Ydata_profiling assisted me in EDA¶

From above graphs, Action Latency, Total hours, and APM were a majority of the models most important features.¶

Conclusion:¶

Based on these findings, I have some recommendations for players:¶

Outside research, I tried implementing this method to my models: https://towardsdatascience.com/simple-trick-to-train-an-ordinal-regression-with-any-classifier-6911183d2a3c¶

EXTRA: More resource intesive model (Automated optimization)¶

2nd best performing model but gives a decimal prediction like RF¶

Outside research, I tried implementing this method to my models: https://towardsdatascience.com/simple-trick-to-train-an-ordinal-regression-with-any-classifier-6911183d2a3c ¶