Datasets¶
State Crime Data Set:
Dataset in CSV format containing crime reports spanning 1960 to 2019. Data is given as rates of reported offenses per 100,000 population and divided into two main categories: property and violent crime. Property crime includes burglary, larceny, and motor vehicle theft. Violent crime refers to assault, murder, rape, and robbery.
State School Performance Data Set:
Robust dataset in CSV format containing various statistics relating to state school performance from 2005 up until 2015. It contains average GPA and SAT scores by state, as well as breakdowns by subject and income level. For example, the data contains the average Math GPA of all students in a given state during a certain year, or the average SAT score for students with a family income between $60,000 and $80,000.
%cd /content
!git clone https://github.com/sshelton1/CMPS-3160-Crime-vs-Academics-Project
/content
Cloning into 'CMPS-3160-Crime-vs-Academics-Project'...
remote: Enumerating objects: 35, done.
remote: Counting objects: 100% (35/35), done.
remote: Compressing objects: 100% (26/26), done.
remote: Total 35 (delta 8), reused 25 (delta 4), pack-reused 0
Receiving objects: 100% (35/35), 1.01 MiB | 7.63 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Motivation¶
The relationship between education and crime rates is bidirectional. States with poor educational performance may see increased crime rates because limited access to quality education can hinder socioeconomic mobility and limit opportunities for future employment, potentially leading individuals to turn to crime as a means of survival. Poor school engagement and high dropout rates can also increase the risk of involvement in criminal activity among disengaged youth. Addressing educational disparities and promoting equitable access to quality education are essential for mitigating these effects and fostering safer, more prosperous communities.
Conversely, higher crime rates in a region can also impact school performance metrics. High crime rates can create an environment of fear and instability, leading to decreased attendance and increased dropout rates. Schools in high-crime areas may also struggle to provide a safe and conducive learning environment, which hinders academic achievement. Resources that could be allocated to education may be diverted to address safety concerns, further compromising the quality of education provided.
Through data science techniques, this notebook explores the relationship between state school performance metrics and state crime rates to better understand the underlying patterns and trends related to this complicated issue. We applied statistical methods such as correlation analysis and regression modeling to quantify the strength and direction of relationships between the variables and to identify potential predictors of crime rates. These analyses can serve as a starting point for further research that can potentially inform policy decisions and interventions aimed at improving education, reducing crime, and improving public safety.
Research Questions¶
The main research questions we explore in this notebook are:
- Do states with a higher average SAT score have lower crime rates?
- Is there a statistically significant correlation between these metrics?
- Can crime rates be used to predict SAT scores through a linear regression model, or vice versa?
Loading and Parsing Datasets¶
We will begin our analysis by loading our datasets into DataFrames and parsing them into a more usable format.
#Mount Drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
cd /content/CMPS-3160-Crime-vs-Academics-Project/data
/content/CMPS-3160-Crime-vs-Academics-Project/data
School Performance Scores Dataset
#Load School Performance Scores data into a Dataframe
school_scores = pd.read_csv('school_scores.csv')
school_scores.head()
Year | State.Code | State.Name | Total.Math | Total.Test-takers | Total.Verbal | Academic Subjects.Arts/Music.Average GPA | Academic Subjects.Arts/Music.Average Years | Academic Subjects.English.Average GPA | Academic Subjects.English.Average Years | ... | Score Ranges.Between 600 to 700.Math.Total | Score Ranges.Between 600 to 700.Verbal.Females | Score Ranges.Between 600 to 700.Verbal.Males | Score Ranges.Between 600 to 700.Verbal.Total | Score Ranges.Between 700 to 800.Math.Females | Score Ranges.Between 700 to 800.Math.Males | Score Ranges.Between 700 to 800.Math.Total | Score Ranges.Between 700 to 800.Verbal.Females | Score Ranges.Between 700 to 800.Verbal.Males | Score Ranges.Between 700 to 800.Verbal.Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2005 | AL | Alabama | 559 | 3985 | 567 | 3.92 | 2.2 | 3.53 | 3.9 | ... | 1096 | 596 | 613 | 1209 | 156 | 292 | 448 | 219 | 250 | 469 |
1 | 2005 | AK | Alaska | 519 | 3996 | 523 | 3.76 | 1.9 | 3.35 | 3.9 | ... | 787 | 423 | 375 | 798 | 47 | 116 | 163 | 109 | 115 | 224 |
2 | 2005 | AZ | Arizona | 530 | 18184 | 526 | 3.85 | 2.1 | 3.45 | 3.9 | ... | 3814 | 1831 | 1679 | 3510 | 327 | 630 | 957 | 412 | 501 | 913 |
3 | 2005 | AR | Arkansas | 552 | 1600 | 563 | 3.90 | 2.2 | 3.61 | 4.0 | ... | 443 | 242 | 226 | 468 | 49 | 83 | 132 | 77 | 93 | 170 |
4 | 2005 | CA | California | 522 | 186552 | 504 | 3.76 | 1.8 | 3.32 | 3.8 | ... | 37605 | 16078 | 14966 | 31044 | 5451 | 8792 | 14243 | 5011 | 4766 | 9777 |
5 rows × 99 columns
The DataFrame currently has each variable as a column. To make the different variables more easily accessible, we will melt the DataFrame into tidy format, where there is only one observation per row. Each observation can then be identified by its State.Name, Year, and Variable name. This allows for much quicker manipulation and filtering for whatever analysis we want to perform.
id_vars = ["Year", "State.Code", "State.Name"] #Set our identifier variables
value_vars = school_scores.columns.difference(id_vars) #Use the remaining columns as variables
#Melt the data into tidy format so that each row forms a separate observation for a certain state in a certain year
school_scores_tidy = pd.melt(school_scores,
id_vars=id_vars,
value_vars=value_vars,
var_name="Variable",
value_name="Value")
school_scores_tidy.head()
Year | State.Code | State.Name | Variable | Value | |
---|---|---|---|---|---|
0 | 2005 | AL | Alabama | Academic Subjects.Arts/Music.Average GPA | 3.92 |
1 | 2005 | AK | Alaska | Academic Subjects.Arts/Music.Average GPA | 3.76 |
2 | 2005 | AZ | Arizona | Academic Subjects.Arts/Music.Average GPA | 3.85 |
3 | 2005 | AR | Arkansas | Academic Subjects.Arts/Music.Average GPA | 3.90 |
4 | 2005 | CA | California | Academic Subjects.Arts/Music.Average GPA | 3.76 |
For example, we can now easily filter the tidy frame for a single variable, such as the statewide average Math SAT score:
SAT_Math = school_scores_tidy[(school_scores_tidy["Variable"] == "Total.Math")]
SAT_Math.head()
Year | State.Code | State.Name | Variable | Value | |
---|---|---|---|---|---|
53661 | 2005 | AL | Alabama | Total.Math | 559.0 |
53662 | 2005 | AK | Alaska | Total.Math | 519.0 |
53663 | 2005 | AZ | Arizona | Total.Math | 530.0 |
53664 | 2005 | AR | Arkansas | Total.Math | 552.0 |
53665 | 2005 | CA | California | Total.Math | 522.0 |
Computing a Sample Statistic
The new tidy data set allows us to easily compute different metrics across the data. For example, after some filtering, we can calculate the average English GPA for Louisiana across all years in the data set.
#Start by filtering the dataset to only include Values for Louisiana and Average English GPA
state = "LA"
LA_english_GPA = school_scores_tidy[(school_scores_tidy["State.Code"] == state) & (school_scores_tidy["Variable"] == "Academic Subjects.English.Average GPA")]
LA_english_GPA.head()
Year | State.Code | State.Name | Variable | Value | |
---|---|---|---|---|---|
1172 | 2005 | LA | Louisiana | Academic Subjects.English.Average GPA | 3.46 |
1224 | 2006 | LA | Louisiana | Academic Subjects.English.Average GPA | 3.52 |
1276 | 2007 | LA | Louisiana | Academic Subjects.English.Average GPA | 3.54 |
1329 | 2008 | LA | Louisiana | Academic Subjects.English.Average GPA | 3.53 |
1382 | 2009 | LA | Louisiana | Academic Subjects.English.Average GPA | 3.54 |
Now the filtered DataFrame only contains average English GPA values from Louisiana, which makes analysis easier. Let's try computing the mean:
avg_LA_eng_GPA = LA_english_GPA["Value"].mean() #Takes the average of all the values in the filtered data frame
rounded_avg_eng_gpa = round(avg_LA_eng_GPA, 2)
print("Average English GPA for Louisiana:", rounded_avg_eng_gpa)
Average English GPA for Louisiana: 3.55
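As an aside, the tidy layout makes it just as easy to compute this statistic for every state at once. Here is a minimal sketch, assuming the school_scores_tidy DataFrame built above:
#Hedged sketch: average English GPA per state across all years, using the tidy frame
eng = school_scores_tidy[school_scores_tidy["Variable"] == "Academic Subjects.English.Average GPA"]
state_means = eng.groupby("State.Code")["Value"].mean().round(2)
print(state_means.loc["LA"]) #Should match the Louisiana figure computed above (3.55)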
We can also use the filtered dataset to easily create a basic visualization.
plt.plot(LA_english_GPA['Year'], LA_english_GPA['Value'], marker='o') #Creates a line graph plotting the average English GPA in Louisiana for each year in the data set
plt.xlabel('Year')
plt.ylabel('Average English GPA')
plt.title('Average English GPA in Louisiana 2005-2015')
[Figure: line plot of average English GPA in Louisiana, 2005-2015]
Now, we will load and parse the state crime dataset in the same way.
State Crime Dataset
#Load state crime data into a Dataframe
state_crime = pd.read_csv('state_crime.csv')
state_crime.head()
State | Year | Data.Population | Data.Rates.Property.All | Data.Rates.Property.Burglary | Data.Rates.Property.Larceny | Data.Rates.Property.Motor | Data.Rates.Violent.All | Data.Rates.Violent.Assault | Data.Rates.Violent.Murder | ... | Data.Rates.Violent.Robbery | Data.Totals.Property.All | Data.Totals.Property.Burglary | Data.Totals.Property.Larceny | Data.Totals.Property.Motor | Data.Totals.Violent.All | Data.Totals.Violent.Assault | Data.Totals.Violent.Murder | Data.Totals.Violent.Rape | Data.Totals.Violent.Robbery | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | 1960 | 3266740 | 1035.4 | 355.9 | 592.1 | 87.3 | 186.6 | 138.1 | 12.4 | ... | 27.5 | 33823 | 11626 | 19344 | 2853 | 6097 | 4512 | 406 | 281 | 898 |
1 | Alabama | 1961 | 3302000 | 985.5 | 339.3 | 569.4 | 76.8 | 168.5 | 128.9 | 12.9 | ... | 19.1 | 32541 | 11205 | 18801 | 2535 | 5564 | 4255 | 427 | 252 | 630 |
2 | Alabama | 1962 | 3358000 | 1067.0 | 349.1 | 634.5 | 83.4 | 157.3 | 119.0 | 9.4 | ... | 22.5 | 35829 | 11722 | 21306 | 2801 | 5283 | 3995 | 316 | 218 | 754 |
3 | Alabama | 1963 | 3347000 | 1150.9 | 376.9 | 683.4 | 90.6 | 182.7 | 142.1 | 10.2 | ... | 24.7 | 38521 | 12614 | 22874 | 3033 | 6115 | 4755 | 340 | 192 | 828 |
4 | Alabama | 1964 | 3407000 | 1358.7 | 466.6 | 784.1 | 108.0 | 213.1 | 163.0 | 9.3 | ... | 29.1 | 46290 | 15898 | 26713 | 3679 | 7260 | 5555 | 316 | 397 | 992 |
5 rows × 21 columns
Like we did with the school performance data set, we want our state crime data set to be in a tidy format with one observation per row.
id_vars = ["Year", "State"] #Set our identifier variables
value_vars1 = state_crime.columns.difference(id_vars) #Use the remaining columns as variables
sc_t = pd.melt(state_crime,
id_vars=id_vars,
value_vars=value_vars1,
var_name="Variable",
value_name="Value")
sc_t.head()
Year | State | Variable | Value | |
---|---|---|---|---|
0 | 1960 | Alabama | Data.Population | 3266740.0 |
1 | 1961 | Alabama | Data.Population | 3302000.0 |
2 | 1962 | Alabama | Data.Population | 3358000.0 |
3 | 1963 | Alabama | Data.Population | 3347000.0 |
4 | 1964 | Alabama | Data.Population | 3407000.0 |
Next, we create a filtered DataFrame that only contains the years 2005-2015, since these are the only years covered by the school performance dataset.
# Filter the DataFrame to include only years between 2005 and 2015
sc_t_filtered = sc_t[(sc_t['Year'] >= 2005) & (sc_t['Year'] <= 2015)]
# Print the filtered DataFrame
sc_t_filtered
Year | State | Variable | Value | |
---|---|---|---|---|
45 | 2005 | Alabama | Data.Population | 4548327.0 |
46 | 2006 | Alabama | Data.Population | 4599030.0 |
47 | 2007 | Alabama | Data.Population | 4627851.0 |
48 | 2008 | Alabama | Data.Population | 4661900.0 |
49 | 2009 | Alabama | Data.Population | 4708708.0 |
... | ... | ... | ... | ... |
59176 | 2011 | Wyoming | Data.Totals.Violent.Robbery | 71.0 |
59177 | 2012 | Wyoming | Data.Totals.Violent.Robbery | 61.0 |
59178 | 2013 | Wyoming | Data.Totals.Violent.Robbery | 75.0 |
59179 | 2014 | Wyoming | Data.Totals.Violent.Robbery | 53.0 |
59180 | 2015 | Wyoming | Data.Totals.Violent.Robbery | 59.0 |
10868 rows × 4 columns
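As a quick sanity check on the units, each rate should be recoverable from the corresponding total and population, since rates are reported offenses per 100,000 population. A minimal sketch of that check, assuming the tidy sc_t frame above (agreement may be off slightly due to rounding in the source data):
#Hedged sketch: verify Data.Rates.Property.All = Data.Totals.Property.All / Data.Population * 100,000
row = sc_t[(sc_t["State"] == "Alabama") & (sc_t["Year"] == 2005)]
pop = row.loc[row["Variable"] == "Data.Population", "Value"].iloc[0]
total = row.loc[row["Variable"] == "Data.Totals.Property.All", "Value"].iloc[0]
rate = row.loc[row["Variable"] == "Data.Rates.Property.All", "Value"].iloc[0]
print(total / pop * 100000, "vs reported rate:", rate)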
Additional Extraction, Transform, and Load (ETL) + Exploratory Data Analysis (EDA)¶
To get a better idea of the very broad trends in our school performance dataset, we will begin by calculating the average subject metrics across all the years present in the data set (2005-2015) and creating a visualization. This will help us see which states on average have higher GPAs, SAT scores, and crime rates.
Average English GPA for each State
# Filter the DataFrame to include only rows where Variable is 'Academic Subjects.English.Average GPA'
english_gpa_df = school_scores_tidy[school_scores_tidy['Variable'] == 'Academic Subjects.English.Average GPA']
# Pivot the filtered DataFrame to have years as columns and English GPA as values
df_pivot = english_gpa_df.pivot_table(index='State.Name', columns='Year', values='Value', aggfunc='mean')
# Calculate the average GPA for all years for each state, skipping NaN values
df_pivot['Average_GPA_All_Years'] = df_pivot.mean(axis=1, skipna=True)
# Extract only the columns 'State.Name' and 'Average_GPA_All_Years'
average_englishgpa_by_state = df_pivot[['Average_GPA_All_Years']]
# Sort the DataFrame by the average GPA in ascending order
average_englishgpa_by_state = average_englishgpa_by_state.sort_values(by='Average_GPA_All_Years', ascending=True)
# Display the DataFrame
average_englishgpa_by_state
Year | Average_GPA_All_Years |
---|---|
State.Name | |
Virgin Islands | 3.113333 |
District Of Columbia | 3.129091 |
Massachusetts | 3.226364 |
Connecticut | 3.226364 |
Rhode Island | 3.259091 |
Maryland | 3.280909 |
Hawaii | 3.292727 |
Delaware | 3.295455 |
New Jersey | 3.310000 |
New York | 3.318182 |
New Hampshire | 3.330000 |
Indiana | 3.331818 |
Florida | 3.364545 |
California | 3.371818 |
Maine | 3.378182 |
South Carolina | 3.383636 |
Georgia | 3.392727 |
Vermont | 3.395455 |
Virginia | 3.413636 |
Pennsylvania | 3.415455 |
Texas | 3.426364 |
Nevada | 3.431818 |
Alaska | 3.450909 |
North Carolina | 3.454545 |
Arizona | 3.483636 |
Washington | 3.502727 |
Idaho | 3.504545 |
Oregon | 3.506364 |
Ohio | 3.511818 |
New Mexico | 3.529091 |
Montana | 3.551818 |
Louisiana | 3.552727 |
Colorado | 3.594545 |
Alabama | 3.594545 |
Puerto Rico | 3.605556 |
Illinois | 3.613636 |
West Virginia | 3.633333 |
Tennessee | 3.637273 |
Missouri | 3.657273 |
Kansas | 3.676364 |
Arkansas | 3.680909 |
Oklahoma | 3.691818 |
Kentucky | 3.697273 |
Minnesota | 3.700909 |
Utah | 3.702727 |
Mississippi | 3.703636 |
Michigan | 3.710909 |
Wisconsin | 3.711818 |
Nebraska | 3.725455 |
Wyoming | 3.733636 |
South Dakota | 3.744545 |
Iowa | 3.777273 |
North Dakota | 3.794545 |
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 8))
plt.bar(average_englishgpa_by_state.index, average_englishgpa_by_state['Average_GPA_All_Years'], color='skyblue')
plt.title('Average English GPA Across States, 2005-2015')
plt.xlabel('State')
plt.ylabel('Average English GPA')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
The average English GPAs for each state across 2005-2015 are all relatively high, above 3.0. There is not much variation from state to state.
Average Math GPA for Each State
# Filter the DataFrame to include only rows where Variable is 'Academic Subjects.Mathematics.Average GPA'
math_gpa_df = school_scores_tidy[school_scores_tidy['Variable'] == 'Academic Subjects.Mathematics.Average GPA']
# Pivot the filtered DataFrame to have years as columns and Math GPA as values
df_pivot = math_gpa_df.pivot_table(index='State.Name', columns='Year', values='Value', aggfunc='mean')
# Calculate the average GPA for all years for each state, skipping NaN values
df_pivot['Average_GPA_All_Years'] = df_pivot.mean(axis=1, skipna=True)
# Extract only the columns 'State.Name' and 'Average_GPA_All_Years'
average_mgpa_by_state = df_pivot[['Average_GPA_All_Years']]
# Sort the DataFrame by the average GPA in descending order
average_mgpa_by_state = average_mgpa_by_state.sort_values(by='Average_GPA_All_Years', ascending=False)
# Display the DataFrame
average_mgpa_by_state
Year | Average_GPA_All_Years |
---|---|
State.Name | |
North Dakota | 3.643636 |
South Dakota | 3.623636 |
Mississippi | 3.618182 |
Iowa | 3.610000 |
Wisconsin | 3.577273 |
Michigan | 3.570000 |
Kentucky | 3.567273 |
Nebraska | 3.563636 |
Minnesota | 3.560000 |
Oklahoma | 3.550000 |
Arkansas | 3.549091 |
Missouri | 3.545455 |
Wyoming | 3.541818 |
Kansas | 3.521818 |
Tennessee | 3.509091 |
Illinois | 3.501818 |
Utah | 3.497273 |
Puerto Rico | 3.444444 |
Alabama | 3.430909 |
Louisiana | 3.430000 |
West Virginia | 3.404444 |
Colorado | 3.401818 |
Ohio | 3.381818 |
New Mexico | 3.337273 |
Montana | 3.276364 |
Oregon | 3.262727 |
Arizona | 3.261818 |
Idaho | 3.243636 |
Pennsylvania | 3.240000 |
Texas | 3.234545 |
North Carolina | 3.223636 |
South Carolina | 3.219091 |
Washington | 3.212727 |
Maine | 3.210000 |
Vermont | 3.191818 |
Georgia | 3.176364 |
Virginia | 3.173636 |
New Jersey | 3.160909 |
Delaware | 3.139091 |
Alaska | 3.130909 |
New Hampshire | 3.099091 |
Massachusetts | 3.098182 |
Florida | 3.097273 |
Nevada | 3.088182 |
Indiana | 3.087273 |
Connecticut | 3.072727 |
California | 3.072727 |
Maryland | 3.055455 |
Rhode Island | 3.051818 |
New York | 3.041818 |
Hawaii | 3.039091 |
District Of Columbia | 2.973636 |
Virgin Islands | 2.897778 |
plt.figure(figsize=(10, 8))
plt.bar(average_mgpa_by_state.index, average_mgpa_by_state['Average_GPA_All_Years'], color='salmon')
plt.title('Average Mathematics GPA Across States, 2005-2015')
plt.xlabel('State')
plt.ylabel('Average Mathematics GPA')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
Average Crime Rates By State 2005-2015
Next, to get more insight into the trends in the state crime data set, we will average the rates of property and violent crime across the years 2005-2015. Rates are the number of reported offenses per 100,000 population.
# Filter the DataFrame for years 2005-2015 and 'Data.Rates.Property.All' variable
filtered_df = sc_t[(sc_t['Year'] >= 2005) & (sc_t['Year'] <= 2015) & (sc_t['Variable'] == 'Data.Rates.Property.All')]
average_prates_by_state = filtered_df.groupby('State')['Value'].mean()
average_prates_by_state = average_prates_by_state.sort_values(ascending=False)
average_prates_by_state
State
District of Columbia    4817.854545
South Carolina          3919.545455
Washington              3886.363636
Arizona                 3825.918182
Louisiana               3724.418182
New Mexico              3715.927273
Arkansas                3710.227273
Texas                   3658.836364
Tennessee               3651.145455
Georgia                 3629.654545
Hawaii                  3625.700000
Alabama                 3620.636364
Florida                 3613.527273
North Carolina          3549.872727
Oklahoma                3423.645455
Missouri                3409.127273
Oregon                  3307.863636
Delaware                3262.381818
Kansas                  3240.863636
Utah                    3228.118182
Ohio                    3226.454545
Nevada                  3177.127273
Indiana                 3100.290909
Maryland                3024.709091
Alaska                  3010.618182
United States           2984.972727
Mississippi             2975.927273
Colorado                2886.809091
Nebraska                2832.254545
California              2809.863636
Montana                 2669.881818
Minnesota               2668.036364
Michigan                2660.581818
Illinois                2639.200000
Rhode Island            2520.045455
Kentucky                2500.781818
Wyoming                 2495.590909
Wisconsin               2488.336364
Iowa                    2390.800000
Maine                   2356.936364
West Virginia           2337.618182
Virginia                2294.136364
Connecticut             2242.063636
Vermont                 2210.509091
Massachusetts           2202.972727
Pennsylvania            2201.263636
New Hampshire           2099.072727
Idaho                   2091.600000
New Jersey              2065.245455
North Dakota            2017.863636
New York                1910.336364
South Dakota            1842.145455
Name: Value, dtype: float64
# Plotting
plt.figure(figsize=(12, 8))
average_prates_by_state.plot(kind='bar', color='skyblue')
plt.title('Average Property Crime Rates (2005-2015) by State')
plt.xlabel('State')
plt.ylabel('Average Property Crime Rates')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
We repeat the same computation and visualization for violent crime rates.
# Filter the DataFrame for years 2005-2015 and 'Data.Rates.Violent.All' variable
filtered_df = sc_t[(sc_t['Year'] >= 2005) & (sc_t['Year'] <= 2015) & (sc_t['Variable'] == 'Data.Rates.Violent.All')]
average_vrates_by_state = filtered_df.groupby('State')['Value'].mean()
average_vrates_by_state = average_vrates_by_state.sort_values(ascending=False)
average_vrates_by_state
State
District of Columbia    1334.109091
Tennessee                667.063636
Nevada                   665.436364
Alaska                   647.436364
South Carolina           636.236364
New Mexico               622.600000
Delaware                 600.909091
Louisiana                595.690909
Florida                  587.745455
Maryland                 557.854545
Arkansas                 505.754545
Missouri                 487.063636
Michigan                 486.354545
Oklahoma                 475.054545
Illinois                 460.645455
California               459.954545
Texas                    458.790909
Arizona                  451.636364
Massachusetts            438.818182
Alabama                  435.818182
United States            420.463636
Georgia                  419.036364
New York                 402.681818
North Carolina           396.681818
Kansas                   387.154545
Pennsylvania             374.409091
Indiana                  342.181818
Colorado                 338.363636
Ohio                     321.218182
Washington               314.981818
Montana                  307.981818
New Jersey               307.490909
West Virginia            297.009091
Nebraska                 282.054545
Mississippi              281.972727
Iowa                     279.072727
South Dakota             277.618182
Connecticut              276.363636
Wisconsin                273.409091
Hawaii                   265.318182
Oregon                   261.990909
Minnesota                255.472727
Kentucky                 247.836364
Rhode Island             244.854545
Idaho                    229.663636
Virginia                 228.909091
North Dakota             227.836364
Wyoming                  222.881818
Utah                     220.354545
New Hampshire            174.336364
Vermont                  130.172727
Maine                    121.945455
Name: Value, dtype: float64
# Plotting
plt.figure(figsize=(12, 8))
average_vrates_by_state.plot(kind='bar', color='aquamarine')
plt.title('Average Violent Crime Rates (2005-2015) by State')
plt.xlabel('State')
plt.ylabel('Average Violent Crime Rates')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
For these average metrics across states, we will calculate whether there is a statistically significant correlation between each subject GPA (Math and English) and the different crime rates (violent and property).
Correlation Between Average Mathematics GPA and Crime Rates Across States
# Convert both violent crime rate and property crime rate data series to DataFrames for alignment
average_vrates_df = average_vrates_by_state.reset_index()
average_prates_df = average_prates_by_state.reset_index()
# Merge the data on either 'State' or 'State.Name' column for Math GPA and violent crime rate
merged_data_math_vrates = pd.merge(average_mgpa_by_state, average_vrates_df, left_on='State.Name', right_on='State', how='inner')
# Calculate the Pearson correlation coefficient and p-value for Math GPA and violent crime rate
correlation_coefficient_math_vrates, p_value_math_vrates = pearsonr(merged_data_math_vrates['Average_GPA_All_Years'], merged_data_math_vrates['Value'])
# Print the correlation coefficient and p-value for Math GPA and violent crime rate
print("Pearson correlation coefficient for Math GPA and violent crime rate:", correlation_coefficient_math_vrates)
print("p-value for Math GPA and violent crime rate:", p_value_math_vrates)
# Check if the correlation coefficient is statistically significant (using a significance level of 0.05)
if p_value_math_vrates < 0.05:
print("There is a statistically significant correlation between average Math GPA and violent crime rate across states.")
else:
print("There is no statistically significant correlation between average Math GPA and violent crime rate across states.")
# Merge the data on either 'State' or 'State.Name' column for Math GPA and property crime rate
merged_data_math_prates = pd.merge(average_mgpa_by_state, average_prates_df, left_on='State.Name', right_on='State', how='inner')
# Calculate the Pearson correlation coefficient and p-value for Math GPA and property crime rate
correlation_coefficient_math_prates, p_value_math_prates = pearsonr(merged_data_math_prates['Average_GPA_All_Years'], merged_data_math_prates['Value'])
# Print the correlation coefficient and p-value for Math GPA and property crime rate
print("\nPearson correlation coefficient for Math GPA and property crime rate:", correlation_coefficient_math_prates)
print("p-value for Math GPA and property crime rate:", p_value_math_prates)
# Check if the correlation coefficient is statistically significant (using a significance level of 0.05)
if p_value_math_prates < 0.05:
print("There is a statistically significant correlation between average Math GPA and property crime rate across states.")
else:
print("There is no statistically significant correlation between average Math GPA and property crime rate across states.")
Pearson correlation coefficient for Math GPA and violent crime rate: -0.12821256583505453
p-value for Math GPA and violent crime rate: 0.3748962607667073
There is no statistically significant correlation between average Math GPA and violent crime rate across states.
Pearson correlation coefficient for Math GPA and property crime rate: -0.01345233656897091
p-value for Math GPA and property crime rate: 0.9261255298792865
There is no statistically significant correlation between average Math GPA and property crime rate across states.
Correlation Between Average English GPA and Crime Rates Across States
# Convert both data series to DataFrames for alignment
average_prates_df = average_prates_by_state.reset_index()
# Merge the data on either 'State' or 'State.Name' column
merged_data_prates = pd.merge(average_englishgpa_df, average_prates_df, left_on='State.Name', right_on='State', how='inner')
# Calculate the Pearson correlation coefficient and p-value
correlation_coefficient_prates, p_value_prates = pearsonr(merged_data_prates['Average_GPA_All_Years'], merged_data_prates['Value'])
# Print the correlation coefficient and p-value
print("Pearson correlation coefficient for property crime rate:", correlation_coefficient_prates)
print("p-value for property crime rate:", p_value_prates)
# Check if the correlation coefficient is statistically significant (using a significance level of 0.05)
if p_value_prates < 0.05:
print("There is a statistically significant correlation between average English GPA and property crime rate across states.")
else:
print("There is no statistically significant correlation between average English GPA and property crime rate across states.")
Pearson correlation coefficient for property crime rate: 0.008777397335887768
p-value for property crime rate: 0.9517599015182187
There is no statistically significant correlation between average English GPA and property crime rate across states.
# Convert both data series to DataFrames for alignment
average_englishgpa_df = average_englishgpa_by_state.reset_index()
average_vrates_df = average_vrates_by_state.reset_index()
# Merge the data on either 'State' or 'State.Name' column
merged_data = pd.merge(average_englishgpa_df, average_vrates_df, left_on='State.Name', right_on='State', how='inner')
# Calculate the Pearson correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(merged_data['Average_GPA_All_Years'], merged_data['Value'])
# Print the correlation coefficient and p-value
print("Pearson correlation coefficient:", correlation_coefficient)
print("p-value:", p_value)
# Check if the correlation coefficient is statistically significant (using a significance level of 0.05)
if p_value < 0.05:
print("There is a statistically significant correlation between average English GPA and violent crime rate across states.")
else:
print("There is no statistically significant correlation between average English GPA and violent crime rate across states.")
Pearson correlation coefficient: -0.12927683538609766
p-value: 0.3709115491626761
There is no statistically significant correlation between average English GPA and violent crime rate across states.
Overall, there is not a strong correlation between the different average subject GPAs and violent or property crime rates. Averaging the GPAs across all years in the dataset, and doing the same with the state crime dataset, may be too broad an approach to pick up on relationships in the data.
We will take a brief detour to analyze Louisiana data specifically.
#Start by filtering the dataset to only include Values for Louisiana and Average English GPA
state = "LA"
LA_english_GPA = school_scores_tidy[(school_scores_tidy["State.Code"] == state) & (school_scores_tidy["Variable"] == "Academic Subjects.English.Average GPA")]
plt.plot(LA_english_GPA['Year'], LA_english_GPA['Value'], marker='o', label='Average English GPA') #Creates a line graph plotting the average English GPA in Louisiana for each year in the data set
LA_math_GPA = school_scores_tidy[(school_scores_tidy["State.Code"] == state) & (school_scores_tidy["Variable"] == "Academic Subjects.Mathematics.Average GPA")]
# Plot Average Math GPA
plt.plot(LA_math_GPA['Year'], LA_math_GPA['Value'], marker='o', label='Average Math GPA')
# Labeling the plot
plt.xlabel('Year')
plt.ylabel('Average GPA')
plt.title('Average English and Math GPA in Louisiana 2005-2015')
plt.legend() # Show legend with labels for each line
# Show the plot
plt.show()
import matplotlib.pyplot as plt
# Define the state as Louisiana
state = 'Louisiana'
# Filter the dataset for Louisiana and Data.Rates.Property.All for years 2005-2015
LA_property_rates = sc_t[(sc_t["State"] == state) &
(sc_t["Variable"] == "Data.Rates.Property.All") &
(sc_t["Year"] >= 2005) & (sc_t["Year"] <= 2015)]
# Plot Data.Rates.Property.All
plt.plot(LA_property_rates['Year'], LA_property_rates['Value'], marker='o', label='Property Crime Rate')
# Filter the dataset for Louisiana and Data.Rates.Violent.All for years 2005-2015
LA_violent_all = sc_t[(sc_t["State"] == state) &
(sc_t["Variable"] == "Data.Rates.Violent.All") &
(sc_t["Year"] >= 2005) & (sc_t["Year"] <= 2015)]
# Plot Data.Rates.Violent.All
plt.plot(LA_violent_all['Year'], LA_violent_all['Value'], marker='o', label='Violent Crime Rate')
# Labeling the plot
plt.xlabel('Year')
plt.ylabel('Crime Rate (per 100,000)')
plt.title('Property Crime Rate and Violent Crime Rate in Louisiana (2005-2015)')
plt.legend() # Show legend with labels for each line
# Show the plot
plt.show()
# Calculate total crime rate by summing property and violent crime rates
LA_total_crime_rates = LA_property_rates['Value'].values + LA_violent_all['Value'].values
# Plot Total Crime Rate
plt.plot(LA_property_rates['Year'], LA_total_crime_rates, marker='o', label='Total Crime Rate')
# Labeling the plot
plt.xlabel('Year')
plt.ylabel('Total Crime Rate')
plt.title('Total Crime Rate in Louisiana (2005-2015)')
plt.legend() # Show legend with labels for each line
# Show the plot
plt.show()
Detailed Analysis and Model Building¶
For our more detailed analysis, we decided to focus on Verbal and Math SAT scores (range 200-800).
State-to-state differences in curriculum and in how GPA is calculated may introduce too many potential confounding variables into state GPA measures, so a standardized test is a better metric. We will again start by looking at the average across all years in the data set.
Average State Math SAT across 2005-2015
# Filter the DataFrame to include only rows where Variable is 'Total.Math'
math_sat_df = school_scores_tidy[school_scores_tidy['Variable'] == 'Total.Math']
# Pivot the filtered DataFrame to have years as columns and Math SAT scores as values
df_pivot_sat = math_sat_df.pivot_table(index='State.Name', columns='Year', values='Value', aggfunc='mean')
# Calculate the average SAT score for all years for each state, skipping NaN values
df_pivot_sat['Average_SAT_All_Years'] = df_pivot_sat.mean(axis=1, skipna=True)
# Extract only the columns 'State.Name' and 'Average_SAT_All_Years'
average_sat_by_state = df_pivot_sat[['Average_SAT_All_Years']]
# Preview the per-state averages (note: sort_values returns a new DataFrame, so without reassignment the preview below remains in alphabetical order)
average_sat_by_state.head()
Year | Average_SAT_All_Years |
---|---|
State.Name | |
Alabama | 548.909091 |
Alaska | 516.909091 |
Arizona | 526.818182 |
Arkansas | 568.818182 |
California | 515.818182 |
Average State Verbal SAT across 2005-2015
# Filter the DataFrame to include only rows where Variable is 'Total.Verbal'
verbal_sat_df = school_scores_tidy[school_scores_tidy['Variable'] == 'Total.Verbal']
# Pivot the filtered DataFrame to have years as columns and Verbal SAT scores as values
df_pivot_vsat = verbal_sat_df.pivot_table(index='State.Name', columns='Year', values='Value', aggfunc='mean')
# Calculate the average SAT score for all years for each state, skipping NaN values
df_pivot_vsat['Average_VSAT_All_Years'] = df_pivot_vsat.mean(axis=1, skipna=True)
# Extract only the columns 'State.Name' and 'Average_VSAT_All_Years'
average_vsat_by_state = df_pivot_vsat[['Average_VSAT_All_Years']]
# Preview the per-state averages (note: sort_values returns a new DataFrame, so without reassignment the preview below remains in alphabetical order)
average_vsat_by_state.head()
Year | Average_VSAT_All_Years |
---|---|
State.Name | |
Alabama | 554.545455 |
Alaska | 518.818182 |
Arizona | 519.727273 |
Arkansas | 570.636364 |
California | 499.181818 |
import matplotlib.pyplot as plt
# Create a bar plot
plt.figure(figsize=(12, 6))
average_sat_by_state['Average_SAT_All_Years'].plot(kind='bar', color='pink')
# Add titles and labels
plt.title('Average Math SAT Scores Across States')
plt.xlabel('State')
plt.ylabel('Average Math SAT Score')
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
Instead of performing our correlation analysis on averages across all years in the data set, we will now look at the correlation between SAT scores and crime rates for every state-year pair, giving us a more detailed picture of the trends in the data.
Filtering Necessary Datasets for Easier Merging
sc_t_filtered = sc_t[(sc_t['Year'] >= 2005) & (sc_t['Year'] <= 2015)] #obtain original filtered dataframe
SAT_Math = school_scores_tidy[(school_scores_tidy["Variable"] == "Total.Math")] #Obtain data frame with only the Math SAT scores
SAT_Math = SAT_Math.rename(columns={'State.Name': 'State'}) #rename columns for easier merging with state crime dataset
SAT_Verbal = school_scores_tidy[(school_scores_tidy["Variable"] == "Total.Verbal")] #Repeat same for Verbal SAT scores
SAT_Verbal = SAT_Verbal.rename(columns={'State.Name': 'State'})
Correlation Between Violent Crime and Math SAT scores¶
import pandas as pd
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Calculate the correlation coefficient
correlation = violent_data['Value_crime'].corr(violent_data['Value_sat'])
# Display the correlation coefficient
print("Correlation coefficient between Violent Crime and Math SAT scores:", correlation)
Correlation coefficient between Violent Crime and Math SAT scores: -0.1092578620814315
from scipy.stats import pearsonr
# Calculate Pearson correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(violent_data['Value_crime'], violent_data['Value_sat'])
# Check if the p-value is less than the significance level (e.g., 0.05)
significance_level = 0.05
if p_value < significance_level:
print("The correlation is statistically significant.")
print("P-value:", p_value)
else:
print("The correlation is not statistically significant.")
print("P-value:", p_value)
The correlation is statistically significant.
P-value: 0.010482676794930321
Correlation Between Property Crime and Math SAT scores¶
import pandas as pd
from scipy.stats import pearsonr
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Property.All'
property_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Property.All']
# Calculate the correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(property_data['Value_crime'], property_data['Value_sat'])
# Check if the p-value is less than the significance level (e.g., 0.05)
significance_level = 0.05
if p_value < significance_level:
print("The correlation between Property Crime Rates and Math SAT scores is statistically significant.")
else:
print("The correlation between Property Crime Rates and Math SAT scores is not statistically significant.")
# Print the correlation coefficient and p-value
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
The correlation between Property Crime Rates and Math SAT scores is statistically significant.
Pearson correlation coefficient: -0.09748918336435577
P-value: 0.02246625932822033
import pandas as pd
import matplotlib.pyplot as plt
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Property.All'
property_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Property.All']
# Create a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(property_data['Value_crime'], property_data['Value_sat'], color='salmon', alpha=0.5)
plt.title('Relationship between State Property Crime Rates and State Math SAT Scores')
plt.xlabel('Property Crime Rates')
plt.ylabel('Math SAT Scores')
plt.grid(True)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Create a scatter plot for violent crime rates and Math SAT scores
plt.figure(figsize=(10, 6))
plt.scatter(violent_data['Value_crime'], violent_data['Value_sat'], color='purple', alpha=0.5)
plt.title('Relationship between Violent Crime Rates and Math SAT Scores')
plt.xlabel('Violent Crime Rates')
plt.ylabel('Math SAT Scores')
plt.grid(True)
plt.show()
Visually, the correlation does not appear strong, but it is statistically significant.
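To see the weak trend that the test is detecting, we can overlay a least-squares line on the scatter plot. A minimal sketch, assuming the violent_data frame from the cell above is still in scope:
import numpy as np
#Hedged sketch: overlay a least-squares fit on the violent crime vs Math SAT scatter
x = violent_data['Value_crime'].to_numpy()
y = violent_data['Value_sat'].to_numpy()
slope, intercept = np.polyfit(x, y, 1) #Degree-1 polynomial fit, i.e. a straight line
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='purple', alpha=0.5)
xs = np.linspace(x.min(), x.max(), 100)
plt.plot(xs, slope * xs + intercept, color='black', label=f'Least-squares fit (slope={slope:.3f})')
plt.xlabel('Violent Crime Rates')
plt.ylabel('Math SAT Scores')
plt.legend()
plt.show()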
Correlation between Violent Crime Rates and Verbal SAT scores¶
import pandas as pd
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Verbal, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
correlation_coefficient, p_value = pearsonr(violent_data['Value_crime'], violent_data['Value_sat'])
# Check if the p-value is less than the significance level (e.g., 0.05)
significance_level = 0.05
if p_value < significance_level:
print("The correlation between Violent Crime Rates and Verbal SAT scores is statistically significant.")
else:
print("The correlation between Violent Crime Rates and Verbal SAT scores is not statistically significant.")
# Print the correlation coefficient and p-value
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
The correlation between Violent Crime Rates and Verbal SAT scores is statistically significant.
Pearson correlation coefficient: -0.09499449343510703
P-value: 0.02616749027117341
Correlation between Property Crime Rates and Verbal SAT scores¶
import pandas as pd
from scipy.stats import pearsonr
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Verbal, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Property.All'
property_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Property.All']
# Calculate the correlation coefficient and p-value
correlation_coefficient, p_value = pearsonr(property_data['Value_crime'], property_data['Value_sat'])
# Check if the p-value is less than the significance level (e.g., 0.05)
significance_level = 0.05
if p_value < significance_level:
print("The correlation between Property Crime Rates and Verbal SAT scores is statistically significant.")
else:
print("The correlation between Property Crime Rates and Verbal SAT scores is not statistically significant.")
# Print the correlation coefficient and p-value
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)
The correlation between Property Crime Rates and Verbal SAT scores is not statistically significant.
Pearson correlation coefficient: -0.06714393083411004
P-value: 0.11641926666809245
import pandas as pd
import matplotlib.pyplot as plt
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Verbal, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Filter rows where the Variable is 'Data.Rates.Property.All'
property_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Property.All']
# Create scatter plot for 'Data.Rates.Violent.All' and Verbal SAT scores
plt.figure(figsize=(10, 6))
plt.scatter(violent_data['Value_crime'], violent_data['Value_sat'], color='purple', alpha=0.5)
plt.title('Relationship between Violent Crime Rates and Verbal SAT Scores')
plt.xlabel('Violent Crime Rates')
plt.ylabel('Verbal SAT Scores')
plt.grid(True)
plt.show()
# Create scatter plot for 'Data.Rates.Property.All' and Verbal SAT scores
plt.figure(figsize=(10, 6))
plt.scatter(property_data['Value_crime'], property_data['Value_sat'], color='hotpink', alpha=0.5)
plt.title('Relationship between Property Crime Rates and Verbal SAT Scores')
plt.xlabel('Property Crime Rates')
plt.ylabel('Verbal SAT Scores')
plt.grid(True)
plt.show()
So, based on our analysis, there is a statistically significant correlation between:
- Violent Crime and Math SAT scores:
  - Correlation coefficient: -0.1093
  - P-value: 0.01048
- Property Crime and Math SAT scores:
  - Correlation coefficient: -0.0975
  - P-value: 0.02247
- Violent Crime and Verbal SAT scores:
  - Correlation coefficient: -0.09499
  - P-value: 0.02617
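For reference, the three tests can be recomputed compactly in one loop. A minimal sketch, assuming the sc_t_filtered, SAT_Math, and SAT_Verbal frames defined earlier are in scope:
#Hedged sketch: recompute the three statistically significant correlations in one pass
pairs = [("Violent Crime vs Math SAT", SAT_Math, "Data.Rates.Violent.All"),
         ("Property Crime vs Math SAT", SAT_Math, "Data.Rates.Property.All"),
         ("Violent Crime vs Verbal SAT", SAT_Verbal, "Data.Rates.Violent.All")]
for label, sat_df, crime_var in pairs:
    merged = pd.merge(sc_t_filtered, sat_df, on=["Year", "State"], suffixes=("_crime", "_sat"))
    subset = merged[merged["Variable_crime"] == crime_var]
    r, p = pearsonr(subset["Value_crime"], subset["Value_sat"])
    print(f"{label}: r={r:.4f}, p={p:.4f}")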
Based on this evidence, we hypothesized that, using a linear regression model, we would be able to predict SAT scores based on crime rates.
Linear Regression Modeling¶
We will begin by using both property and violent crime rates to try to predict SAT scores across the metrics that had a statistically significant correlation.
Violent crime rates to predict Verbal SAT scores
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Merge DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Verbal, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Define our independent variable (violent crime rates) and dependent variable (Verbal SAT scores)
X = violent_data[['Value_crime']]
y = violent_data['Value_sat']
# Split the data into our training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
Mean Squared Error: 1570.3583224821539
R-squared: 0.02331744320660878
Violent crime rates to predict Math SAT scores
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Define the independent variable (violent crime rates) and dependent variable (Math SAT scores)
X = violent_data[['Value_crime']]
y = violent_data['Value_sat']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
Mean Squared Error: 1563.1446430586477
R-squared: 0.03226531736360605
Multiple Linear Regression: Violent and Property Crime to Predict Math SAT Scores
Both of our models that used violent crime as the sole predictor had a high mean squared error and a low R-squared, indicating low accuracy and weak predictive power. We then attempted a multiple linear regression model using both types of crime rates as predictors for Math SAT scores.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Merge the two DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All' and 'Data.Rates.Property.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
property_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Property.All']
# Combine the two datasets on 'Year' and 'State'
combined_data = pd.merge(violent_data, property_data, on=['Year', 'State'], suffixes=('_violent', '_property'))
# Filter columns based on their names
X_cols = combined_data.filter(like='Value_crime').columns
y_col = combined_data.filter(like='Value_sat').columns[0] # Both suffixed SAT columns hold the same Math score, so take the first
# Define the independent variables (violent crime rate and property crime rate) and dependent variable (Math SAT scores)
X = combined_data[X_cols]
y = combined_data[y_col]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the multiple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
Mean Squared Error: 1564.9969423935395
R-squared: 0.031118568521802392
Still a high MSE and low R-squared. We then attempted to see if reversing the model, using SAT scores to predict violent crime rates, would produce a better model.
Verbal SAT Scores to Predict Violent Crime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Merge DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Verbal, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Define our independent variable (Verbal SAT scores) and dependent variable (violent crime rates)
X = violent_data[['Value_sat']]
y = violent_data['Value_crime']
# Split the data into our training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
Mean Squared Error: 23012.067505107178
R-squared: -0.0031964986166197473
Math SAT Scores to Predict Violent Crime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Merge DataFrames on 'Year' and 'State'
merged_df = pd.merge(sc_t_filtered, SAT_Math, on=['Year', 'State'], suffixes=('_crime', '_sat'))
# Filter rows where the Variable is 'Data.Rates.Violent.All'
violent_data = merged_df[merged_df['Variable_crime'] == 'Data.Rates.Violent.All']
# Define our independent variable (Math SAT scores) and dependent variable (violent crime rates)
X = violent_data[['Value_sat']]
y = violent_data['Value_crime']
# Split the data into our training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
Mean Squared Error: 22796.51092723738
R-squared: 0.00620055378316009
Conclusions and Discussion¶
Across all of our models, low R-squared values indicate that only a small proportion of the variability in SAT scores is explained by crime rates alone.
Relatively high MSE values suggest that the models' predictions deviate substantially from actual SAT scores, meaning they have limited accuracy.
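For intuition, an R-squared near zero means the model barely improves on always predicting the mean of the training targets. A minimal sketch of that comparison, assuming the X_train/X_test/y_train/y_test split and fitted model from the last regression cell above:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
#Hedged sketch: compare the fitted model against a mean-only baseline
baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
baseline_mse = mean_squared_error(y_test, baseline.predict(X_test))
model_mse = mean_squared_error(y_test, model.predict(X_test))
print("Baseline (always predict the mean) MSE:", baseline_mse)
print("Fitted model MSE:", model_mse) #R-squared near zero means these are close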
So, we can conclude that there is a statistically significant correlation between:
- State Math SAT scores and state property and violent crime rates
- State Verbal SAT scores and state violent crime rates
However, our linear regression and multiple regression models using property and violent crime rates as predictors have limited power for predicting SAT scores, and vice versa. Additional variables beyond crime rates are necessary to improve the predictive accuracy of the models.
These conclusions make sense in the context of the real world. We examined very broad datasets that treat each state as a whole, even though school performance and crime rates vary considerably between regions within a state. Performing more granular analyses on specific counties within states would likely lead to better predictive models. There are also many outside factors beyond crime rates that affect school performance, such as socioeconomic status and educational resources.
Further Reading
If you are interested in reading more on this topic, check out the following articles:
School Climate and the Impact of Neighborhood Crime on Test Scores
Jayla Nicken's Study of Violent Crime's Impact on SAT Scores