In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
In [2]:
df = pd.read_csv("./data/StudentsPerformance.csv")
df.head(20)
Out[2]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
5 female group B associate's degree standard none 71 83 78
6 female group B some college standard completed 88 95 92
7 male group B some college free/reduced none 40 43 39
8 male group D high school free/reduced completed 64 64 67
9 female group B high school free/reduced none 38 60 50
10 male group C associate's degree standard none 58 54 52
11 male group D associate's degree standard none 40 52 43
12 female group B high school standard none 65 81 73
13 male group A some college standard completed 78 72 70
14 female group A master's degree standard none 50 53 58
15 female group C some high school standard none 69 75 78
16 male group C high school standard none 88 89 86
17 female group B some high school free/reduced none 18 32 28
18 male group C master's degree free/reduced completed 46 42 46
19 female group C associate's degree free/reduced none 54 58 61
In [3]:
df.shape
Out[3]:
(1000, 8)
In [4]:
df.describe()
Out[4]:
math score reading score writing score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
In [5]:
df["Average Score"]=((df["math score"]+df["reading score"]+df["writing score"])/3).round(5)
df.head(20)
Out[5]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score Average Score
0 female group B bachelor's degree standard none 72 72 74 72.66667
1 female group C some college standard completed 69 90 88 82.33333
2 female group B master's degree standard none 90 95 93 92.66667
3 male group A associate's degree free/reduced none 47 57 44 49.33333
4 male group C some college standard none 76 78 75 76.33333
5 female group B associate's degree standard none 71 83 78 77.33333
6 female group B some college standard completed 88 95 92 91.66667
7 male group B some college free/reduced none 40 43 39 40.66667
8 male group D high school free/reduced completed 64 64 67 65.00000
9 female group B high school free/reduced none 38 60 50 49.33333
10 male group C associate's degree standard none 58 54 52 54.66667
11 male group D associate's degree standard none 40 52 43 45.00000
12 female group B high school standard none 65 81 73 73.00000
13 male group A some college standard completed 78 72 70 73.33333
14 female group A master's degree standard none 50 53 58 53.66667
15 female group C some high school standard none 69 75 78 74.00000
16 male group C high school standard none 88 89 86 87.66667
17 female group B some high school free/reduced none 18 32 28 26.00000
18 male group C master's degree free/reduced completed 46 42 46 44.66667
19 female group C associate's degree free/reduced none 54 58 61 57.66667
In [6]:
df["gender"].value_counts()
Out[6]:
gender
female    518
male      482
Name: count, dtype: int64
In [7]:
from sklearn.preprocessing import LabelEncoder
lc=LabelEncoder()
df["gender"]=lc.fit_transform(df["gender"])
df["race/ethnicity"]=lc.fit_transform(df["race/ethnicity"])
df["parental level of education"]=lc.fit_transform(df["parental level of education"])
df["lunch"]=lc.fit_transform(df["lunch"])
df["test preparation course"]=lc.fit_transform(df["test preparation course"])
df.head(20)
Out[7]:
gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score Average Score
0 0 1 1 1 1 72 72 74 72.66667
1 0 2 4 1 0 69 90 88 82.33333
2 0 1 3 1 1 90 95 93 92.66667
3 1 0 0 0 1 47 57 44 49.33333
4 1 2 4 1 1 76 78 75 76.33333
5 0 1 0 1 1 71 83 78 77.33333
6 0 1 4 1 0 88 95 92 91.66667
7 1 1 4 0 1 40 43 39 40.66667
8 1 3 2 0 0 64 64 67 65.00000
9 0 1 2 0 1 38 60 50 49.33333
10 1 2 0 1 1 58 54 52 54.66667
11 1 3 0 1 1 40 52 43 45.00000
12 0 1 2 1 1 65 81 73 73.00000
13 1 0 4 1 0 78 72 70 73.33333
14 0 0 3 1 1 50 53 58 53.66667
15 0 2 5 1 1 69 75 78 74.00000
16 1 2 2 1 1 88 89 86 87.66667
17 0 1 5 0 1 18 32 28 26.00000
18 1 2 3 0 0 46 42 46 44.66667
19 0 2 0 0 1 54 58 61 57.66667
In [8]:
sns.countplot(x=df["gender"] , hue = df["race/ethnicity"])
# 1 is male and 0 is female
Out[8]:
<Axes: xlabel='gender', ylabel='count'>
No description has been provided for this image
In [9]:
df["test preparation course"].value_counts()
# 1 denote not done and 0 denote the completed
Out[9]:
test preparation course
1    642
0    358
Name: count, dtype: int64
In [10]:
labels = ["not completed","completed"]
color=["red","green"]
# plt.figure(facecolor="green")
plt.pie(df["test preparation course"].value_counts(),labels=labels,colors=color)
Out[10]:
([<matplotlib.patches.Wedge at 0x16b168680>,
  <matplotlib.patches.Wedge at 0x16b1249e0>],
 [Text(-0.47460171119818767, 0.9923473261553901, 'not completed'),
  Text(0.4746018041084478, -0.9923472817199666, 'completed')])
No description has been provided for this image
In [11]:
plt.figure(figsize=(12,6))
sns.barplot(x="test preparation course",y="Average Score",data=df)
plt.show()
# 0 for done and 1 for not done so we can say who have done has better result
No description has been provided for this image
In [12]:
sns.barplot(x = df["lunch"],y=df["Average Score"],palette=["red","green"])
# plt.show()
# 1 is standard and 0 is free or reduced
/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/265420584.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x = df["lunch"],y=df["Average Score"],palette=["red","green"])
Out[12]:
<Axes: xlabel='lunch', ylabel='Average Score'>
No description has been provided for this image
In [13]:
sns.barplot(x=df['parental level of education'],y=df["Average Score"],data=df,palette="inferno")
# 0=associate degree, 1=bachelor's degree,2=high school,3=master's degree,4=some college,5= some high school
/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/3868579111.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=df['parental level of education'],y=df["Average Score"],data=df,palette="inferno")
Out[13]:
<Axes: xlabel='parental level of education', ylabel='Average Score'>
No description has been provided for this image
In [14]:
plt.figure(figsize=(12,6))
sns.pairplot(df)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x16b266480>
<Figure size 1200x600 with 0 Axes>
No description has been provided for this image
In [15]:
plt.figure(figsize=(12,6))
sns.heatmap(df.corr())
Out[15]:
<Axes: >
No description has been provided for this image
In [16]:
# Lets divide into target varaibel and predictor
y=df["Average Score"]#Target score
x=df.drop(["Average Score","reading score","writing score","math score"],axis=1)
y.shape
Out[16]:
(1000,)
In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
y_train.shape
Out[17]:
(700,)
In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
lr=LinearRegression()
lr.fit(X_train,y_train)
y_predict=lr.predict(X_test)
In [19]:
y_predict
Out[19]:
array([70.98127112, 66.27322644, 68.91170688, 56.90687812, 77.26157742,
       60.61150762, 64.47634728, 74.5734257 , 58.67457349, 71.00885117,
       53.91685753, 69.08094565, 67.15707412, 62.40838679, 75.32464328,
       78.44729397, 68.46978304, 60.16958378, 70.70858605, 64.17608215,
       66.09092506, 69.5359321 , 68.21016058, 61.35530033, 66.14059634,
       69.35363073, 76.65041481, 70.98127112, 65.22013999, 62.40838679,
       63.71206709, 75.32464328, 56.29571551, 69.5228695 , 69.5228695 ,
       58.53451852, 60.61150762, 71.15050989, 69.65549959, 60.02952881,
       75.15540451, 71.00885117, 76.65041481, 65.33970748, 60.61150762,
       81.70999963, 59.72765994, 70.09742344, 74.88271944, 73.96226309,
       71.45077501, 72.91820526, 73.38770915, 66.27322644, 68.77165191,
       71.15050989, 68.91170688, 67.41669659, 65.47976245, 72.63649156,
       69.07191703, 67.76823673, 70.98127112, 58.53451852, 67.02444402,
       69.5359321 , 62.40838679, 57.7907258 , 70.14709471, 72.91820526,
       79.02927279, 70.53934728, 56.90687812, 58.36527975, 72.94578531,
       76.81965358, 78.88921781, 56.73763935, 64.89778364, 65.20707738,
       58.84381226, 58.40188842, 66.97477275, 69.21357575, 65.03783861,
       61.35530033, 78.88921781, 74.5734257 , 76.51035983, 53.03300985,
       57.34880196, 73.08744403, 68.91170688, 55.85379167, 67.27664162,
       69.35363073, 67.58593536, 65.33970748, 70.98127112, 69.97785594,
       75.15540451, 69.5228695 , 63.54282832, 62.23914802, 62.65898063,
       64.59591477, 71.29056486, 66.53284891, 75.89919722, 70.09742344,
       63.73415831, 59.4183662 , 62.65898063, 66.66547901, 60.47145265,
       66.53284891, 64.2036622 , 76.20849097, 72.33462269, 67.46636786,
       57.03950822, 60.47145265, 68.0278592 , 67.76823673, 78.44729397,
       67.14401152, 80.65691318, 76.51035983, 65.47976245, 72.94578531,
       70.70858605, 69.35363073, 58.40188842, 81.26807579, 70.98127112,
       61.35530033, 78.2780552 , 71.00885117, 53.47493369, 58.36527975,
       66.11301629, 75.89919722, 66.14059634, 68.46978304, 74.88271944,
       60.30221388, 61.33320911, 67.14401152, 68.51945432, 73.97129171,
       81.70999963, 76.78304491, 67.41669659, 56.46495428, 71.59243373,
       73.08744403, 63.29223447, 66.66547901, 69.84522584, 74.85513939,
       78.2780552 , 72.33462269, 78.88921781, 64.2036622 , 75.32464328,
       65.64900122, 65.52943373, 60.78074639, 78.2780552 , 69.04433698,
       71.00885117, 71.29056486, 71.91479008, 70.14709471, 58.84381226,
       61.96646295, 68.46978304, 68.65208442, 74.85513939, 76.51035983,
       68.1604893 , 61.35530033, 70.58901855, 64.2036622 , 69.96479334,
       70.40671718, 68.60241314, 62.0974893 , 66.53284891, 71.45077501,
       62.26672807, 63.27014324, 58.67457349, 59.86029004, 66.41328141,
       71.03094239, 79.02927279, 58.53451852, 70.98127112, 78.44729397,
       71.59243373, 69.5359321 , 62.40838679, 57.34880196, 70.40671718,
       55.85379167, 78.2780552 , 78.71997904, 76.20849097, 60.02952881,
       63.27014324, 60.47145265, 76.81965358, 68.77165191, 67.27664162,
       67.41669659, 69.35363073, 66.53284891, 78.88921781, 68.46978304,
       61.77513295, 64.34532092, 76.95228368, 74.27155683, 55.85379167,
       67.71856546, 71.59243373, 68.60241314, 61.96646295, 75.15540451,
       67.58593536, 62.65898063, 64.17608215, 62.53941314, 67.58593536,
       67.27664162, 67.41669659, 67.76823673, 66.83471777, 65.47976245,
       54.80070522, 65.64900122, 66.71515028, 70.98127112, 77.26157742,
       66.83471777, 58.67457349, 70.8396124 , 59.72765994, 79.02927279,
       60.77171777, 74.5734257 , 60.91337649, 74.85513939, 63.27014324,
       60.02952881, 67.15707412, 78.58734894, 68.46978304, 69.04433698,
       52.59108601, 69.21357575, 80.82615195, 70.53934728, 68.21016058,
       70.28714968, 69.5228695 , 69.04433698, 64.59591477, 75.15540451,
       58.36527975, 70.70858605, 68.0278592 , 69.07191703, 57.34880196,
       59.4183662 , 64.2036622 , 59.72765994, 55.85379167, 78.71997904,
       61.79722418, 66.11301629, 67.41669659, 67.41669659, 56.90687812,
       78.71997904, 71.15050989, 65.47976245, 68.46978304, 70.40671718,
       64.59591477, 68.46978304, 76.78304491, 70.8396124 , 69.21357575,
       65.22013999, 58.84381226, 52.59108601, 54.96994399, 76.95228368])
In [20]:
y_test
Out[20]:
521    87.00000
737    64.00000
740    75.00000
660    74.66667
411    81.66667
         ...   
468    83.33333
935    63.66667
428    59.00000
7      40.66667
155    82.33333
Name: Average Score, Length: 300, dtype: float64
In [21]:
diff=abs(y_predict-y_test)
diff.mean()
Out[21]:
10.671228279143728