# List of URLs to scrape data from
urls = [
    'https://www.nhl.com/ducks/roster/2021',
    'https://www.nhl.com/coyotes/roster/2021',
    'https://www.nhl.com/bruins/roster/2021',
    'https://www.nhl.com/sabres/roster/2021',
    'https://www.nhl.com/flames/roster/2021',
    'https://www.nhl.com/hurricanes/roster/2021',
    'https://www.nhl.com/blackhawks/roster/2021',
    'https://www.nhl.com/avalanche/roster/2021',
    'https://www.nhl.com/bluejackets/roster/2021',
    'https://www.nhl.com/stars/roster/2021',
    'https://www.nhl.com/redwings/roster/2021',
    'https://www.nhl.com/oilers/roster/2021',
    'https://www.nhl.com/panthers/roster/2021',
    'https://www.nhl.com/kings/roster/2021',
    'https://www.nhl.com/wild/roster/2021',
    'https://www.nhl.com/canadiens/roster/2021',
    'https://www.nhl.com/predators/roster/2021',
    'https://www.nhl.com/devils/roster/2021',
    'https://www.nhl.com/islanders/roster/2021',
    'https://www.nhl.com/rangers/roster/2021',
    'https://www.nhl.com/senators/roster/2021',
    'https://www.nhl.com/flyers/roster/2021',
    'https://www.nhl.com/penguins/roster/2021',
    'https://www.nhl.com/sharks/roster/2021',
    'https://www.nhl.com/kraken/roster/2021',
    'https://www.nhl.com/blues/roster/2021',
    'https://www.nhl.com/lightning/roster/2021',
    'https://www.nhl.com/mapleleafs/roster/2021',
    'https://www.nhl.com/canucks/roster/2021',
    'https://www.nhl.com/goldenknights/roster/2021',
    'https://www.nhl.com/capitals/roster/2021',
    'https://www.nhl.com/jets/roster/2021'
]


import requests
from bs4 import BeautifulSoup
import pandas as pd
import time 


# Initialize an empty list to store all the data
all_data1 = []

# Loop through each URL and scrape the data
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    # Assuming you want to extract data from table 2 (Forwards)
    table_forwards1 = tables[2]

    data_list_forwards1 = []
    headers_forwards1 = []

    for row in table_forwards1.find_all('tr'):
        if not headers_forwards1:
            headers_forwards1 = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data1 = [data.text.strip() for data in row.find_all('td')]
            data_list_forwards1.append(row_data1)

    # Assuming you want to extract data from table 5 (Defense)
    table_defense1 = tables[5]

    data_list_defense1 = []
    headers_defense1 = [header.text.strip() for header in table_defense1.find_all('th')]

    for row in table_defense1.find_all('tr'):
        if not headers_defense1:
            table_defense1 = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data1 = [data.text.strip() for data in row.find_all('td')]
            data_list_defense1.append(row_data1)

    # Assuming you want to extract data from table 8 (Goalies)
    table_goalies1 = tables[8]

    data_list_goalies1 = []
    headers_goalies1 = []

    for row in table_goalies1.find_all('tr'):
        if not headers_goalies1:
            headers_goalies1 = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data1 = [data.text.strip() for data in row.find_all('td')]
            data_list_goalies1.append(row_data1)

    # Create DataFrames for each table and add them to the 'all_data' list
    df_forwards1 = pd.DataFrame(data_list_forwards1, columns=headers_forwards1)
    df_forwards1['current_team'] = url.split('/')[-3]
    all_data1.append(df_forwards1)

    df_defense1 = pd.DataFrame(data_list_defense1, columns=headers_defense1)
    df_defense1['current_team'] = url.split('/')[-3]
    all_data1.append(df_defense1)

    df_goalies1 = pd.DataFrame(data_list_goalies1, columns=headers_goalies1)
    df_goalies1['current_team'] = url.split('/')[-3]
    all_data1.append(df_goalies1)
    
    # Add a sleep command to avoid overwhelming the server
    time.sleep(20)  # Sleep for 3 seconds before making the next request

# Concatenate all the DataFrames in the 'all_data' list into a single DataFrame
Variables_df_combined1 = pd.concat(all_data1, ignore_index=True)

# Drop rows with None values
Variables_df_combined1 = Variables_df_combined1.dropna()

# Reset the DataFrame index
Variables_df_combined1 = Variables_df_combined1.reset_index(drop=True)


# Display the updated DataFrame
print(Variables_df_combined1)

       # Pos  Sh         Ht   Wt                  Born  \
0     12   C   L  6'1"6' 1"  205  08/10/94Aug 10, 1994   
1     39   C   R  6'0"6' 0"  200   02/04/92Feb 4, 1992   
2     44  LW   L  6'2"6' 2"  210   01/08/99Jan 8, 1999   
3     44  LW   L  6'1"6' 1"  220  02/22/91Feb 22, 1991   
4     15   C   R  6'4"6' 4"  220  05/10/85May 10, 1985   
...   ..  ..  ..        ...  ...                   ...   
1207  54   D   L  6'4"6' 4"  219  01/24/99Jan 24, 1999   
1208  88   D   L  6'0"6' 0"  195  07/16/91Jul 16, 1991   
1209  64   D   L  6'7"6' 7"  242  05/26/98May 26, 1998   
1210  31   G  --  6'1"6' 1"  183   07/06/95Jul 6, 1995   
1211  37   G  --  6'4"6' 4"  216  05/19/93May 19, 1993   

                  Birthplace current_team  
0     Staten Island, NY, USA        ducks  
1           Markham, ON, CAN        ducks  
2         Longueuil, QC, CAN        ducks  
3           LaSalle, QC, CAN        ducks  
4            Regina, SK, CAN        ducks  
...                      ...          ...  
1207        Saginaw, MN, USA         jets  
1208      St. Cloud, MN, USA         jets  
1209       Waterloo, ON, CAN         jets  
1210       Edmonton, AB, CAN         jets  
1211       Commerce, MI, USA         jets  

[1212 rows x 8 columns]


Variables_df_combined1.describe()


# Count the distinct current teams
team_count = Variables_df_combined1['current_team'].nunique()

# Display the count of distinct current teams
print("Number of distinct current teams:", team_count)

Number of distinct current teams: 32


Variables_df_combined1['Ht'] = Variables_df_combined1['Ht'].str.extract(r"(\d+' \d+\")")


# Extract the 'MM/DD/YY' part from the 'Born' column using str.extract
Variables_df_combined1['Born'] = Variables_df_combined1['Born'].str.extract(r'(\d{2}/\d{2}/\d{2})')

Variables_df_combined1['Birthplace'] = Variables_df_combined1['Birthplace'].str.replace(',','_').str.replace(' ','')

# Step 2: Separate 'Birthplace' into 'Location' and 'Nationality'
Variables_df_combined1[['Birthplace', 'Nationality']] = Variables_df_combined1['Birthplace'].str.rsplit('_', n=1, expand=True)

Variables_df_combined1


Variables_df_combined1 = Variables_df_combined1.drop('#', axis=1)
print(Variables_df_combined1)

     Pos  Sh     Ht   Wt      Born       Birthplace current_team Nationality
0      C   L  6' 1"  205  08/10/94  StatenIsland_NY        ducks         USA
1      C   R  6' 0"  200  02/04/92       Markham_ON        ducks         CAN
2     LW   L  6' 2"  210  01/08/99     Longueuil_QC        ducks         CAN
3     LW   L  6' 1"  220  02/22/91       LaSalle_QC        ducks         CAN
4      C   R  6' 4"  220  05/10/85        Regina_SK        ducks         CAN
...   ..  ..    ...  ...       ...              ...          ...         ...
1207   D   L  6' 4"  219  01/24/99       Saginaw_MN         jets         USA
1208   D   L  6' 0"  195  07/16/91      St.Cloud_MN         jets         USA
1209   D   L  6' 7"  242  05/26/98      Waterloo_ON         jets         CAN
1210   G  --  6' 1"  183  07/06/95      Edmonton_AB         jets         CAN
1211   G  --  6' 4"  216  05/19/93      Commerce_MI         jets         USA

[1212 rows x 8 columns]


Variables_df_combined1.to_csv('NHL_Player_Variables_2021.csv',index=False)


def scrape_data_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    # Assuming you want to extract data from table 1 (Forwards)
    table_forwards = tables[1]

    data_list_forwards = []
    headers_forwards = []

    for row in table_forwards.find_all('tr'):
        if not headers_forwards:
            headers_forwards = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data = [data.text.strip() for data in row.find_all('td')]
            data_list_forwards.append(row_data)

    # Assuming you want to extract data from table 4 (Defense)
    table_defense = tables[4]

    data_list_defense = []
    headers_defense = []

    for row in table_defense.find_all('tr'):
        if not headers_defense:
            headers_defense = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data = [data.text.strip() for data in row.find_all('td')]
            data_list_defense.append(row_data)

    # Assuming you want to extract data from table 7 (Goalies)
    table_goalies = tables[7]

    data_list_goalies = []
    headers_goalies = []

    for row in table_goalies.find_all('tr'):
        if not headers_goalies:
            headers_goalies = [header.text.strip() for header in row.find_all('th')]
        else:
            row_data = [data.text.strip() for data in row.find_all('td')]
            data_list_goalies.append(row_data)

    # Create DataFrames for each table
    df_forwards = pd.DataFrame(data_list_forwards, columns=headers_forwards)
    df_defense = pd.DataFrame(data_list_defense, columns=headers_defense)
    df_goalies = pd.DataFrame(data_list_goalies, columns=headers_goalies)

    # Concatenate the DataFrames
    df_combined = pd.concat([df_forwards, df_defense, df_goalies], ignore_index=True)

    # Remove any row with all None values (empty rows)
    df_combined.dropna(how='all', inplace=True)
    
    # Reset the DataFrame index
    df_combined = df_combined.reset_index(drop=True)

    # Clean the 'Player' column
    df_combined['Player'] = df_combined['Player'].str.replace(r'\n', ' ', regex=True).str.strip()
    df_combined['Player'] = df_combined['Player'].apply(lambda x: x.replace('(A)', '').strip())
    df_combined['Player'] = df_combined['Player'].apply(lambda x: x.replace('(C)', '').strip())
    df_combined['Player'] = df_combined['Player'].apply(lambda x: x.replace('*', '').strip())
    df_combined['Player'] = df_combined['Player'].apply(lambda x: ' '.join(x.split()[1:]))

    
    return df_combined


# Initialize an empty list to store all the DataFrames
all_dataframes = []

# Loop through each URL and scrape the data
for url in urls:
    df_combined = scrape_data_from_url(url)
    all_dataframes.append(df_combined)

    # Add a sleep command to avoid overwhelming the server
    time.sleep(3)  # Sleep for 3 seconds before making the next request

# Concatenate all the DataFrames into a single DataFrame
df_combined = pd.concat(all_dataframes, ignore_index=True)

# Display the combined DataFrame
print(df_combined)

                   Player
0        Zach Aston-Reese
1             Sam Carrick
2             Max Comtois
3     Nicolas Deslauriers
4            Ryan Getzlaf
...                   ...
1207        Dylan Samberg
1208         Nate Schmidt
1209        Logan Stanley
1210          Eric Comrie
1211    Connor Hellebuyck

[1212 rows x 1 columns]


df_combined.describe()


df_combined.to_csv('NHL_2021_Player_Names.csv',index=False)


from datetime import datetime

def get_draft_data(urls):
    all_dataframes = []
    current_year = datetime.now().year
    
    for url in urls:
        # Send an HTTP request to the URL
        response = requests.get(url)

        # Check the status code of the response
        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.content, "html.parser")

            # Find the remaining table (assuming there is only one table left) and extract its data into a list
            table = soup.find('table')

            # Check if the table is found before proceeding with data extraction
            if table:
                table_data = []
                for row in table.find_all('tr')[1:]:
                    cols = row.find_all(['th', 'td'])
                    cols = [col.text.strip() for col in cols]
                    table_data.append(cols)

                # Create a DataFrame from the extracted data
                columns = table_data[0]
                data = table_data[1:]
                df = pd.DataFrame(data, columns=columns)

                # Find the index of the rows with the repeated column headers
                repeated_headers_index = df[df['Year'].str.contains('Year')].index

                # Drop the rows with the repeated column headers
                df = df.drop(repeated_headers_index)

                # Reset the index of the DataFrame
                df.reset_index(drop=True, inplace=True)

                # Drop the columns that are unneeded
                columns_to_drop = ['Draft', 'Lg']
                df.drop(columns=columns_to_drop, inplace=True)

                # Add the 'Current_Team' column and set its value to the team name extracted from the URL
                Drafted_Team = url.split("/")[-2]
                df['Drafted_Team'] = Drafted_Team
                
                # Add the 'Year' column as an integer
                df['Year'] = df['Year'].astype(int)
                
                # Filter out rows with 'Year' values 26 or more years ago from the current year (BIG JOE)
                df = df[df['Year'] >= current_year - 26]

                # Append the DataFrame to the list
                all_dataframes.append(df)

                # Sleep for 5 seconds before accessing the next URL
                time.sleep(5)
        else:
            print("Failed to get the webpage:", url)

    # Combine all DataFrames into one
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    return combined_df

# Define the URLs of the webpages containing the tables
urls = [
    "https://www.hockey-reference.com/teams/ANA/draft.html",
    "https://www.hockey-reference.com/teams/PHX/draft.html",
    "https://www.hockey-reference.com/teams/BOS/draft.html",
    "https://www.hockey-reference.com/teams/BUF/draft.html",
    "https://www.hockey-reference.com/teams/CGY/draft.html",
    "https://www.hockey-reference.com/teams/CAR/draft.html",
    "https://www.hockey-reference.com/teams/CHI/draft.html",
    "https://www.hockey-reference.com/teams/COL/draft.html",
    "https://www.hockey-reference.com/teams/CBJ/draft.html",
    "https://www.hockey-reference.com/teams/DAL/draft.html",
    "https://www.hockey-reference.com/teams/DET/draft.html",
    "https://www.hockey-reference.com/teams/EDM/draft.html",
    "https://www.hockey-reference.com/teams/FLA/draft.html",
    "https://www.hockey-reference.com/teams/LAK/draft.html",
    "https://www.hockey-reference.com/teams/MIN/draft.html",
    "https://www.hockey-reference.com/teams/MTL/draft.html",
    "https://www.hockey-reference.com/teams/NSH/draft.html",
    "https://www.hockey-reference.com/teams/NJD/draft.html",
    "https://www.hockey-reference.com/teams/NYI/draft.html",
    "https://www.hockey-reference.com/teams/NYR/draft.html",
    "https://www.hockey-reference.com/teams/OTT/draft.html",
    "https://www.hockey-reference.com/teams/PHI/draft.html",
    "https://www.hockey-reference.com/teams/PIT/draft.html",
    "https://www.hockey-reference.com/teams/SJS/draft.html",
    "https://www.hockey-reference.com/teams/SEA/draft.html",
    "https://www.hockey-reference.com/teams/STL/draft.html",
    "https://www.hockey-reference.com/teams/TBL/draft.html",
    "https://www.hockey-reference.com/teams/TOR/draft.html",
    "https://www.hockey-reference.com/teams/VAN/draft.html",
    "https://www.hockey-reference.com/teams/VEG/draft.html",
    "https://www.hockey-reference.com/teams/WSH/draft.html",
    "https://www.hockey-reference.com/teams/WPG/draft.html"
]

# Get the DataFrame with the extracted and combined data from all URLs
df_combined = get_draft_data(urls)

# Display the combined DataFrame
print(df_combined)

      Year Rd. Overall           Player Pos                    Amateur Team  \
0     2023   1       2     Leo Carlsson   C                 Orebro (Sweden)   
1     2023   2      33    Nico Myatovic  LW      Seattle Thunderbirds (WHL)   
2     2023   2      59   Carey Terrance   C               Erie Otters (OHL)   
3     2023   2      60     Damian Clara   G      Farjestad Jr. (Sweden-Jr.)   
4     2023   3      65    Coulson Pitre  RW           Flint Firebirds (OHL)   
...    ...  ..     ...              ...  ..                             ...   
6316  1999   6     159  Yuri Dobryshkin  RW  Krylia Sovetov Moskva (Russia)   
6317  1999   7     188     Stephen Baby  RW       Green Bay Gamblers (USHL)   
6318  1999   8     217    Garnet Exelby   D          Saskatoon Blades (WHL)   
6319  1999   9     245    Tommi Santala   C               Jokerit (Finland)   
6320  1999   9     246  Raymond DiLauro   D             St. Lawrence (ECAC)   

       GP  G   A PTS  +/-  PIM GP W  L  T/O SV% GAA Drafted_Team  
0                                                            ANA  
1                                                            ANA  
2                                                            ANA  
3                                                            ANA  
4                                                            ANA  
...   ... ..  ..  ..  ...  ... .. .. ..  ..  ..  ..          ...  
6316                                                         WPG  
6317                                                         WPG  
6318  408  7  43  50  -28  584                               WPG  
6319   63  2   7   9   -7   46                               WPG  
6320                                                         WPG  

[6321 rows x 19 columns]


# List of all the drafted teams from the URLs
all_teams = [
    "ANA", "PHX", "BOS", "BUF", "CGY", "CAR", "CHI", "COL", "CBJ", "DAL", "DET", "EDM", "FLA", "LAK",
    "MIN", "MTL", "NSH", "NJD", "NYI", "NYR", "OTT", "PHI", "PIT", "SJS", "SEA", "STL", "TBL", "TOR",
    "VAN", "VEG", "WSH", "WPG"
]

# Get the unique drafted teams from the DataFrame
unique_teams_in_df = df_combined['Drafted_Team'].unique()

# Find the missing drafted teams
missing_teams = [team for team in all_teams if team not in unique_teams_in_df]

# Display the missing teams
print("Missing drafted teams:", missing_teams)

Missing drafted teams: []


minimum_year = df_combined['Year'].min()
print(minimum_year)

1997


# Drop the specified columns from the DataFrame
columns_to_drop = ['GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'W', 'L', 'T/O', 'SV%', 'GAA']
df_combined.drop(columns=columns_to_drop, inplace=True)


# Save the combined DataFrame to a CSV file named "draft_data.csv"
df_combined.to_csv("NHL_draft_2023_data.csv", index=False)


# Filter rows where the player name is "Maxime"
maxime_df = df_combined[df_combined["Player"] == "Joe Thornton"]

# Display the DataFrame containing rows where the player name is "Maxime"
print(maxime_df)

     Year Rd. Overall        Player Pos          Amateur Team Drafted_Team
591  1997   1       1  Joe Thornton   C  Soo Greyhounds (OHL)          BOS

NHL Draft Player Analysis Project Overview:¶

Objective:¶

Background:¶

Data Collection & Cleaning:¶

Key Code Excerpts:¶

Insights Gained:¶

MYSQL CODE

Conclusion:¶

For a young hockey talent, these are my recommendations:¶

	#	Pos	Sh	Ht	Wt	Born	Birthplace	current_team
count	1212	1212	1212	1212	1212	1212	1212	1212
unique	97	5	3	16	88	987	615	32
top	17	D	L	6'2"6' 2"	190	08/14/92Aug 14, 1992	Toronto, ON, CAN	canadiens
freq	29	372	678	221	71	4	33	46

	#	Pos	Sh	Ht	Wt	Born	Birthplace	current_team	Nationality
0	12	C	L	6' 1"	205	08/10/94	StatenIsland_NY	ducks	USA
1	39	C	R	6' 0"	200	02/04/92	Markham_ON	ducks	CAN
2	44	LW	L	6' 2"	210	01/08/99	Longueuil_QC	ducks	CAN
3	44	LW	L	6' 1"	220	02/22/91	LaSalle_QC	ducks	CAN
4	15	C	R	6' 4"	220	05/10/85	Regina_SK	ducks	CAN
...	...	...	...	...	...	...	...	...	...
1207	54	D	L	6' 4"	219	01/24/99	Saginaw_MN	jets	USA
1208	88	D	L	6' 0"	195	07/16/91	St.Cloud_MN	jets	USA
1209	64	D	L	6' 7"	242	05/26/98	Waterloo_ON	jets	CAN
1210	31	G	--	6' 1"	183	07/06/95	Edmonton_AB	jets	CAN
1211	37	G	--	6' 4"	216	05/19/93	Commerce_MI	jets	USA