import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#reading the file
df = pd.read_csv("players and stats per minute played 2019-2023.csv")
df

# Set the index to stop at
index_to_stop_at = 665

# Keep only rows before (and including) the specified index
df= df.iloc[:index_to_stop_at]
df

df = df.fillna(0)
df = df.drop_duplicates(keep = "first")
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})')
df['Year'] = df['Year'].astype(int)
df.head()

selected_stat = "Pt_min"  # Replace with the stat you want to visualize
selected_year = 2019     # Replace with the year you want to analyze
min_games = 10           # Replace with the minimum number of games a team must have
# Filter data for the chosen year
year_df = df[df['Year'] == selected_year]

# Calculate the maximum number of games played by any player for each team
max_games_per_team = year_df.groupby('Team')['Games'].max()

# Filter teams that have at least 'min_games' games based on the max games played by any player in the team
filtered_teams = max_games_per_team[max_games_per_team >= min_games].index

# Filter the dataframe for only those teams
filtered_year_df = year_df[year_df['Team'].isin(filtered_teams)]

# Compute the overall average for the selected stat across the filtered teams
yearly_avg = filtered_year_df[selected_stat].mean()

# 📊 Create Boxplot for the Selected Stat and Year for All Filtered Teams with Average Line
plt.figure(figsize=(15, 8))

# Generate a light color palette for the teams (using Set2 palette)
team_colors = sns.color_palette("Set2", n_colors=len(filtered_teams))

# Create the boxplot
sns.boxplot(data=filtered_year_df, x="Team", y=selected_stat, showfliers=True, hue="Team", palette=team_colors)

# Overlay the yearly average as a horizontal red dashed line
plt.axhline(y=yearly_avg, color="red", linestyle="--", label="Yearly Avg", alpha=0.8)

# 📊 Identifying and labeling upper outliers **for each team individually**
def identify_outliers_by_team(df, stat):
    """ Function to identify upper outliers for each team based on IQR method """
    outliers = []
    for team, group in df.groupby("Team"):
        Q1 = group[stat].quantile(0.25)
        Q3 = group[stat].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + 1.5 * IQR
        
        # Find upper outliers for the team
        team_outliers = group[group[stat] > upper_bound]
        for _, row in team_outliers.iterrows():
            outliers.append((team, row['Player'], row[stat]))
    
    return outliers

# Get the upper outliers for each team
team_outliers = identify_outliers_by_team(filtered_year_df, selected_stat)

# 📊 Identifying and labeling **global outliers** for all teams
def identify_global_outliers(df, stat):
    """ Function to identify global outliers based on IQR method (using the entire dataset's IQR) """
    Q1 = df[stat].quantile(0.25)
    Q3 = df[stat].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    
    # Find global upper outliers (for all teams combined)
    global_outliers = df[df[stat] > upper_bound]
    
    return global_outliers

# Get global upper outliers
global_outliers = identify_global_outliers(filtered_year_df, selected_stat)

# Combine team and global outliers
combined_outliers = team_outliers + [(row['Team'], row['Player'], row[selected_stat]) for _, row in global_outliers.iterrows()]


# Add text labels to the combined outliers (horizontal labels)
for team, player, value in combined_outliers:
    plt.text(x=team, y=value, s=f"{player}: {value:.2f}",
             color='black', ha='center', va='bottom', fontsize=10, rotation=0, fontweight='bold')  # rotation=0 for horizontal text

# Title, labels, and formatting
plt.title(f"Boxplot for {selected_stat} in {selected_year} for Teams with at Least {min_games} Games Played by Any Player", fontsize=14)
plt.xlabel("Teams")
plt.ylabel(f"{selected_stat} Value")
plt.xticks(rotation=45)  # Rotate team names if needed

# Add legend for the average line
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# Settings for the year and minimum games to filter
selected_year = 2021  # Example: choose year
min_games = 8  # Example: minimum games played by any player in the team

# List of stats to visualize (add more stats to this list)
stats_list = ['Pt_min','Ast_min','ORB_min','DRB_min','Stl_min']  # Example stats to plot

# Filter the data for the selected year
year_df = df[df['Year'] == selected_year]

# Calculate the maximum number of games played by any player for each team
max_games_per_team = year_df.groupby('Team')['Games'].max()

# Filter teams that have at least 'min_games' games played by any player in the team
filtered_teams = max_games_per_team[max_games_per_team >= min_games].index

# Filter the dataframe for teams that meet the min_games requirement
filtered_year_df = year_df[year_df['Team'].isin(filtered_teams)]

# Loop through each stat and create a separate boxplot
for selected_stat in stats_list:
    # Compute the overall average for the selected stat across the filtered teams
    yearly_avg = filtered_year_df[selected_stat].mean()

    # Create Boxplot for the Selected Stat and Year for All Filtered Teams with Average Line
    plt.figure(figsize=(10, 6))

    # Generate a light color palette for the teams (using Set2 palette)
    team_colors = sns.color_palette("Set2", n_colors=len(filtered_teams))

    # Create the boxplot
    sns.boxplot(data=filtered_year_df, x="Team", y=selected_stat, showfliers=True, hue="Team", palette=team_colors)

    # Overlay the yearly average as a horizontal red dashed line
    plt.axhline(y=yearly_avg, color="red", linestyle="--", label="Yearly Avg", alpha=0.8)

    # Identifying and labeling upper outliers **for each team individually**
    def identify_outliers_by_team(df, stat):
        """ Function to identify upper outliers for each team based on IQR method """
        outliers = []
        for team, group in df.groupby("Team"):
            Q1 = group[stat].quantile(0.25)
            Q3 = group[stat].quantile(0.75)
            IQR = Q3 - Q1
            upper_bound = Q3 + 1.5 * IQR

            # Find upper outliers for the team
            team_outliers = group[group[stat] > upper_bound]
            for _, row in team_outliers.iterrows():
                outliers.append((team, row['Player'], row[stat]))

        return outliers

    # Get the upper outliers for each team
    team_outliers = identify_outliers_by_team(filtered_year_df, selected_stat)

    # Identifying and labeling **global outliers** for all teams
    def identify_global_outliers(df, stat):
        """ Function to identify global outliers based on IQR method (using the entire dataset's IQR) """
        Q1 = df[stat].quantile(0.25)
        Q3 = df[stat].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + 1.5 * IQR

        # Find global upper outliers (for all teams combined)
        global_outliers = df[df[stat] > upper_bound]

        return global_outliers

    # Get global upper outliers
    global_outliers = identify_global_outliers(filtered_year_df, selected_stat)

    # Combine team and global outliers
    combined_outliers = team_outliers + [(row['Team'], row['Player'], row[selected_stat]) for _, row in global_outliers.iterrows()]
    
    
    # Add text labels to the combined outliers with dynamic offset
    for team, player, value in combined_outliers:
        
        plt.text(x=team, y=value, s=f"{player}: {value:.2f}",
                 color='black', ha='center', va='bottom', fontsize=8, rotation=0, fontweight='normal')

    # Title, labels, and formatting
    plt.title(f"Boxplot for {selected_stat} in {selected_year} with at Least {min_games} Games Played by Any Player", fontsize=14)
    plt.xlabel("Teams")
    plt.ylabel(f"{selected_stat} Value")
    plt.xticks(rotation=45)  # Rotate team names if needed

    # Add legend for the average line
    plt.legend()

    # Display the plot
    plt.tight_layout()
    plt.show()

# Select a specific team and year
team_name = "GSW"  # Change this to the team you want
selected_year = 2019   # Change this to the year you want

# Filter data for the chosen team and year
team_df = df[(df['Team'] == team_name) & (df['Year'] == selected_year)]

# Select key stats to analyze (replace with actual column names)
selected_stats = ["Pt_min", "Ast_min", "ORB_min", "DRB_min", "Stl_min", "BlK_min"]

# Compute the overall yearly average for the selected year
yearly_avg = df[df['Year'] == selected_year][selected_stats].mean()

# 📊 Create Boxplots for the Chosen Year
plt.figure(figsize=(15, 6))

for i, stat in enumerate(selected_stats, 1):
    plt.subplot(1, len(selected_stats), i)  # Create subplots
    
    sns.boxplot(data=team_df, y=stat, showfliers=False)  # Boxplot for player stats in that year
    
    # Overlay the yearly average as a horizontal red dashed line
    plt.axhline(y=yearly_avg[stat], color="red", linestyle="--", alpha=0.8, label="Year Avg" if i == 1 else "")
    
    plt.title(stat)  # Set title per stat
    plt.xlabel("Players")
    plt.xticks([])  # Hide x-axis labels since we focus on boxplots
    plt.ylabel("Value")

# Add legend only once
plt.legend()
plt.suptitle(f"Performance Comparison for {team_name} in {selected_year} vs Yearly Average", fontsize=14)
plt.tight_layout()
plt.show()

C:\Users\johna\AppData\Local\Temp\ipykernel_1076\2169416895.py:31: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  plt.legend()

# Select two teams and a year
team_1 = "GSW"  # Change to the first team you want
team_2 = "TOR"  # Change to the second team you want
selected_year = 2019   # Change this to the year you want

# Filter data for the chosen teams and year
teams_df = df[(df['Team'].isin([team_1, team_2])) & (df['Year'] == selected_year)]

# Select key stats to analyze (replace with actual column names)
selected_stats = ["Pt_min", "Ast_min", "ORB_min", "DRB_min", "Stl_min", "BlK_min"]  

# Compute the overall yearly average for the selected year
yearly_avg = df[df['Year'] == selected_year][selected_stats].mean()

team_colors = {team_1: "lightblue", team_2: "lightcoral"}  # Light blue for Team 1, light red for Team 2

# 📊 Create Boxplots for the Chosen Year and Two Teams with Light Colors
plt.figure(figsize=(15, 6))

for i, stat in enumerate(selected_stats, 1):
    plt.subplot(1, len(selected_stats), i)  # Create subplots
    
    # Use hue to differentiate teams and pass palette for custom colors
    sns.boxplot(data=teams_df, x="Team", y=stat, showfliers=False, hue="Team", palette=team_colors, legend=False)
    
    # Overlay the yearly average as a horizontal red dashed line
    plt.axhline(y=yearly_avg[stat], color="red", linestyle="--", alpha=0.8, label="Yearly Avg")
    
    plt.title(stat)  # Set title per stat
    plt.xlabel("Teams")
    plt.ylabel("Value")

# Add legend only once
plt.legend()
plt.suptitle(f"Performance Comparison for {team_1} and {team_2} in {selected_year} vs Yearly Average", fontsize=14)
plt.tight_layout()
plt.show()

# Connect to a SQLite database (creates the file if it doesn’t exist)
conn = sqlite3.connect("my_database.db")

# Create a cursor object to interact with the database
cursor = conn.cursor()
# Insert the DataFrame into the SQLite database
df.to_sql("players_stats", conn, if_exists="replace", index=False)

# Verify by querying the table and displaying the first few rows
df_sql = pd.read_sql("SELECT * FROM players_stats", conn)
df_sql.head()

query = """
SELECT Year, AVG(Pt_min), AVG(Ast_min), AVG(ORB_min), AVG(DRB_min), AVG(Blk_min), AVG(Stl_min)
FROM players_stats
GROUP BY Year
"""
query1 = """
SELECT *
FROM players_stats
WHERE (Year = 2019 and Team  = 'TOR')
"""

filtered= pd.read_sql(query, conn)
filtered1= pd.read_sql(query1, conn)
filtered

GSW_2019 = filtered[["Player", "Pt_min", "Ast_min", "ORB_min", "DRB_min"]]
TOR_2019 = filtered1[["Player", "Pt_min", "Ast_min"]]

sns.boxplot(GSW_2019)

<Axes: >

plt.figure(figsize=(15, 6))
plt.plot(filtered["Player"],filtered["Points"])
plt.plot(filtered["Player"],filtered["AST"])
plt.xlabel("Players")
plt.ylabel("Stats per Minute")
plt.legend()
plt.show()

C:\Users\johna\AppData\Local\Temp\ipykernel_1076\93248503.py:6: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  plt.legend()

	Player	Team	Year	min	Games	Pt_min	PER	Ast_min	Ast_pct	ORB_min	...	Usage_pct	PF_min	FT_min	FT_pct	_2pt_min	_2pt_pct	_3pt_min	_3pt_pct	eFG_pct	BPM
0	Jayson Tatum	BOS	2019.0	295	9.0	0.464407	13.4	0.057627	10.0	0.023729	...	21.9	0.071186	0.098305	0.744	0.132203	0.481	0.033898	0.323	0.482	1.0
1	Jaylen Brown	BOS	2019.0	274	9.0	0.456204	13.1	0.036496	6.3	0.021898	...	18.4	0.109489	0.083942	0.767	0.109489	0.638	0.051095	0.350	0.586	0.7
2	Al Horford	BOS	2019.0	310	9.0	0.403226	14.2	0.129032	21.7	0.041935	...	19.4	0.067742	0.048387	0.833	0.090323	0.424	0.058065	0.409	0.500	2.8
3	Terry Rozier	BOS	2019.0	162	9.0	0.358025	8.7	0.104938	16.8	0.024691	...	20.6	0.092593	0.074074	0.750	0.067901	0.440	0.049383	0.235	0.390	-1.5
4	Aron Baynes	BOS	2019.0	115	9.0	0.165217	6.7	0.026087	3.9	0.052174	...	7.6	0.173913	0.008696	0.500	0.052174	0.750	0.017391	0.333	0.643	-2.3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
675	NaN	NaN	NaN	NaN	2019.0	0.486486	NaN	0.122172	NaN	0.057508	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
676	NaN	NaN	NaN	NaN	2020.0	0.526344	NaN	0.120766	NaN	0.051591	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
677	NaN	NaN	NaN	NaN	2021.0	0.523412	NaN	0.111111	NaN	0.053954	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
678	NaN	NaN	NaN	NaN	2022.0	0.507865	NaN	0.126866	NaN	0.054144	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
679	NaN	NaN	NaN	NaN	2023.0	0.507555	NaN	0.130777	NaN	0.055721	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Player	Team	Year	min	Games	Pt_min	PER	Ast_min	Ast_pct	ORB_min	...	Usage_pct	PF_min	FT_min	FT_pct	_2pt_min	_2pt_pct	_3pt_min	_3pt_pct	eFG_pct	BPM
0	Jayson Tatum	BOS	2019.0	295	9.0	0.464407	13.4	0.057627	10.0	0.023729	...	21.9	0.071186	0.098305	0.744	0.132203	0.481	0.033898	0.323	0.482	1.0
1	Jaylen Brown	BOS	2019.0	274	9.0	0.456204	13.1	0.036496	6.3	0.021898	...	18.4	0.109489	0.083942	0.767	0.109489	0.638	0.051095	0.350	0.586	0.7
2	Al Horford	BOS	2019.0	310	9.0	0.403226	14.2	0.129032	21.7	0.041935	...	19.4	0.067742	0.048387	0.833	0.090323	0.424	0.058065	0.409	0.500	2.8
3	Terry Rozier	BOS	2019.0	162	9.0	0.358025	8.7	0.104938	16.8	0.024691	...	20.6	0.092593	0.074074	0.750	0.067901	0.440	0.049383	0.235	0.390	-1.5
4	Aron Baynes	BOS	2019.0	115	9.0	0.165217	6.7	0.026087	3.9	0.052174	...	7.6	0.173913	0.008696	0.500	0.052174	0.750	0.017391	0.333	0.643	-2.3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
660	Malik Monk	SAC	2023.0	205	7.0	0.648780	18.3	0.121951	18.2	0.029268	...	24.9	0.073171	0.214634	0.898	0.121951	0.463	0.063415	0.333	0.478	3.3
661	Terence Davis	SAC	2023.0	58	4.0	0.413793	10.2	0.103448	14.4	0.034483	...	15.6	0.206897	0.034483	1.000	0.034483	0.667	0.103448	0.353	0.550	0.7
662	Davion Mitchell	SAC	2023.0	140	7.0	0.357143	9.6	0.085714	11.9	0.021429	...	15.5	0.050000	0.035714	0.833	0.085714	0.632	0.050000	0.259	0.489	-0.9
663	Harrison Barnes	SAC	2023.0	196	7.0	0.382653	12.0	0.025510	3.5	0.051020	...	15.1	0.025510	0.096939	0.731	0.096939	0.543	0.030612	0.240	0.467	0.0
664	Keegan Murray	SAC	2023.0	194	7.0	0.350515	11.4	0.025773	3.6	0.077320	...	13.2	0.092784	0.020619	0.667	0.072165	0.538	0.061856	0.375	0.552	0.0

	Player	Team	Year	min	Games	Pt_min	PER	Ast_min	Ast_pct	ORB_min	...	Usage_pct	PF_min	FT_min	FT_pct	_2pt_min	_2pt_pct	_3pt_min	_3pt_pct	eFG_pct	BPM
0	Jayson Tatum	BOS	2019	295	9.0	0.464407	13.4	0.057627	10.0	0.023729	...	21.9	0.071186	0.098305	0.744	0.132203	0.481	0.033898	0.323	0.482	1.0
1	Jaylen Brown	BOS	2019	274	9.0	0.456204	13.1	0.036496	6.3	0.021898	...	18.4	0.109489	0.083942	0.767	0.109489	0.638	0.051095	0.350	0.586	0.7
2	Al Horford	BOS	2019	310	9.0	0.403226	14.2	0.129032	21.7	0.041935	...	19.4	0.067742	0.048387	0.833	0.090323	0.424	0.058065	0.409	0.500	2.8
3	Terry Rozier	BOS	2019	162	9.0	0.358025	8.7	0.104938	16.8	0.024691	...	20.6	0.092593	0.074074	0.750	0.067901	0.440	0.049383	0.235	0.390	-1.5
4	Aron Baynes	BOS	2019	115	9.0	0.165217	6.7	0.026087	3.9	0.052174	...	7.6	0.173913	0.008696	0.500	0.052174	0.750	0.017391	0.333	0.643	-2.3

	Player	Team	Year	min	Games	Pt_min	PER	Ast_min	Ast_pct	ORB_min	...	Usage_pct	PF_min	FT_min	FT_pct	_2pt_min	_2pt_pct	_3pt_min	_3pt_pct	eFG_pct	BPM
0	Jayson Tatum	BOS	2019.0	295	9.0	0.464407	13.4	0.057627	10.0	0.023729	...	21.9	0.071186	0.098305	0.744	0.132203	0.481	0.033898	0.323	0.482	1.0
1	Jaylen Brown	BOS	2019.0	274	9.0	0.456204	13.1	0.036496	6.3	0.021898	...	18.4	0.109489	0.083942	0.767	0.109489	0.638	0.051095	0.350	0.586	0.7
2	Al Horford	BOS	2019.0	310	9.0	0.403226	14.2	0.129032	21.7	0.041935	...	19.4	0.067742	0.048387	0.833	0.090323	0.424	0.058065	0.409	0.500	2.8
3	Terry Rozier	BOS	2019.0	162	9.0	0.358025	8.7	0.104938	16.8	0.024691	...	20.6	0.092593	0.074074	0.750	0.067901	0.440	0.049383	0.235	0.390	-1.5
4	Aron Baynes	BOS	2019.0	115	9.0	0.165217	6.7	0.026087	3.9	0.052174	...	7.6	0.173913	0.008696	0.500	0.052174	0.750	0.017391	0.333	0.643	-2.3

	Year	AVG(Pt_min)	AVG(Ast_min)	AVG(ORB_min)	AVG(DRB_min)	AVG(Blk_min)	AVG(Stl_min)
0	2019.0	0.422315	0.088460	0.042063	0.141388	0.019748	0.028075
1	2020.0	0.434706	0.090333	0.035395	0.142071	0.016676	0.029306
2	2021.0	0.446172	0.085521	0.043469	0.136436	0.018399	0.026607
3	2022.0	0.418911	0.089734	0.039921	0.130567	0.018455	0.029328
4	2023.0	0.428050	0.090734	0.044986	0.130359	0.018788	0.028316

NBA Champions' DNA Analysis¶

Motivation¶

Methodology¶

Project Objective¶

Reading the data¶

This dataset shows the stats for every player during a playoff run from 2019 to 2023. This dataset can be found in basketballreference.com¶

Data Cleaning¶

After close examination to the DataSet, its possible to notice that after the row #665 there is no more information about players, there is only a statistic summary of the data. Then, those rows will be dropped.¶

One specific thing about this dataset is that some Players that are Centers, did not attempted a 3-point shot, making this entry NaN.¶

To keep the analysis fair and insightfull, those values will be replaced to 0. to guarantee that no miss-droping will happen.¶

Loading the pandas DataFrame into a SQL¶

Key Statistics for Analysis:¶