In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
In [2]:
df = pd.read_csv("./data/StudentsPerformance.csv")
df.head(20)
Out[2]:
| gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
| 5 | female | group B | associate's degree | standard | none | 71 | 83 | 78 |
| 6 | female | group B | some college | standard | completed | 88 | 95 | 92 |
| 7 | male | group B | some college | free/reduced | none | 40 | 43 | 39 |
| 8 | male | group D | high school | free/reduced | completed | 64 | 64 | 67 |
| 9 | female | group B | high school | free/reduced | none | 38 | 60 | 50 |
| 10 | male | group C | associate's degree | standard | none | 58 | 54 | 52 |
| 11 | male | group D | associate's degree | standard | none | 40 | 52 | 43 |
| 12 | female | group B | high school | standard | none | 65 | 81 | 73 |
| 13 | male | group A | some college | standard | completed | 78 | 72 | 70 |
| 14 | female | group A | master's degree | standard | none | 50 | 53 | 58 |
| 15 | female | group C | some high school | standard | none | 69 | 75 | 78 |
| 16 | male | group C | high school | standard | none | 88 | 89 | 86 |
| 17 | female | group B | some high school | free/reduced | none | 18 | 32 | 28 |
| 18 | male | group C | master's degree | free/reduced | completed | 46 | 42 | 46 |
| 19 | female | group C | associate's degree | free/reduced | none | 54 | 58 | 61 |
In [3]:
df.shape
Out[3]:
(1000, 8)
In [4]:
df.describe()
Out[4]:
| math score | reading score | writing score | |
|---|---|---|---|
| count | 1000.00000 | 1000.000000 | 1000.000000 |
| mean | 66.08900 | 69.169000 | 68.054000 |
| std | 15.16308 | 14.600192 | 15.195657 |
| min | 0.00000 | 17.000000 | 10.000000 |
| 25% | 57.00000 | 59.000000 | 57.750000 |
| 50% | 66.00000 | 70.000000 | 69.000000 |
| 75% | 77.00000 | 79.000000 | 79.000000 |
| max | 100.00000 | 100.000000 | 100.000000 |
In [5]:
df["Average Score"]=((df["math score"]+df["reading score"]+df["writing score"])/3).round(5)
df.head(20)
Out[5]:
| gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | Average Score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 | 72.66667 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 | 82.33333 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 | 92.66667 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 | 49.33333 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 | 76.33333 |
| 5 | female | group B | associate's degree | standard | none | 71 | 83 | 78 | 77.33333 |
| 6 | female | group B | some college | standard | completed | 88 | 95 | 92 | 91.66667 |
| 7 | male | group B | some college | free/reduced | none | 40 | 43 | 39 | 40.66667 |
| 8 | male | group D | high school | free/reduced | completed | 64 | 64 | 67 | 65.00000 |
| 9 | female | group B | high school | free/reduced | none | 38 | 60 | 50 | 49.33333 |
| 10 | male | group C | associate's degree | standard | none | 58 | 54 | 52 | 54.66667 |
| 11 | male | group D | associate's degree | standard | none | 40 | 52 | 43 | 45.00000 |
| 12 | female | group B | high school | standard | none | 65 | 81 | 73 | 73.00000 |
| 13 | male | group A | some college | standard | completed | 78 | 72 | 70 | 73.33333 |
| 14 | female | group A | master's degree | standard | none | 50 | 53 | 58 | 53.66667 |
| 15 | female | group C | some high school | standard | none | 69 | 75 | 78 | 74.00000 |
| 16 | male | group C | high school | standard | none | 88 | 89 | 86 | 87.66667 |
| 17 | female | group B | some high school | free/reduced | none | 18 | 32 | 28 | 26.00000 |
| 18 | male | group C | master's degree | free/reduced | completed | 46 | 42 | 46 | 44.66667 |
| 19 | female | group C | associate's degree | free/reduced | none | 54 | 58 | 61 | 57.66667 |
In [6]:
df["gender"].value_counts()
Out[6]:
gender female 518 male 482 Name: count, dtype: int64
In [7]:
from sklearn.preprocessing import LabelEncoder
lc=LabelEncoder()
df["gender"]=lc.fit_transform(df["gender"])
df["race/ethnicity"]=lc.fit_transform(df["race/ethnicity"])
df["parental level of education"]=lc.fit_transform(df["parental level of education"])
df["lunch"]=lc.fit_transform(df["lunch"])
df["test preparation course"]=lc.fit_transform(df["test preparation course"])
df.head(20)
Out[7]:
| gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | Average Score | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 1 | 1 | 72 | 72 | 74 | 72.66667 |
| 1 | 0 | 2 | 4 | 1 | 0 | 69 | 90 | 88 | 82.33333 |
| 2 | 0 | 1 | 3 | 1 | 1 | 90 | 95 | 93 | 92.66667 |
| 3 | 1 | 0 | 0 | 0 | 1 | 47 | 57 | 44 | 49.33333 |
| 4 | 1 | 2 | 4 | 1 | 1 | 76 | 78 | 75 | 76.33333 |
| 5 | 0 | 1 | 0 | 1 | 1 | 71 | 83 | 78 | 77.33333 |
| 6 | 0 | 1 | 4 | 1 | 0 | 88 | 95 | 92 | 91.66667 |
| 7 | 1 | 1 | 4 | 0 | 1 | 40 | 43 | 39 | 40.66667 |
| 8 | 1 | 3 | 2 | 0 | 0 | 64 | 64 | 67 | 65.00000 |
| 9 | 0 | 1 | 2 | 0 | 1 | 38 | 60 | 50 | 49.33333 |
| 10 | 1 | 2 | 0 | 1 | 1 | 58 | 54 | 52 | 54.66667 |
| 11 | 1 | 3 | 0 | 1 | 1 | 40 | 52 | 43 | 45.00000 |
| 12 | 0 | 1 | 2 | 1 | 1 | 65 | 81 | 73 | 73.00000 |
| 13 | 1 | 0 | 4 | 1 | 0 | 78 | 72 | 70 | 73.33333 |
| 14 | 0 | 0 | 3 | 1 | 1 | 50 | 53 | 58 | 53.66667 |
| 15 | 0 | 2 | 5 | 1 | 1 | 69 | 75 | 78 | 74.00000 |
| 16 | 1 | 2 | 2 | 1 | 1 | 88 | 89 | 86 | 87.66667 |
| 17 | 0 | 1 | 5 | 0 | 1 | 18 | 32 | 28 | 26.00000 |
| 18 | 1 | 2 | 3 | 0 | 0 | 46 | 42 | 46 | 44.66667 |
| 19 | 0 | 2 | 0 | 0 | 1 | 54 | 58 | 61 | 57.66667 |
In [8]:
sns.countplot(x=df["gender"] , hue = df["race/ethnicity"])
# 1 is male and 0 is female
Out[8]:
<Axes: xlabel='gender', ylabel='count'>
In [9]:
df["test preparation course"].value_counts()
# 1 denote not done and 0 denote the completed
Out[9]:
test preparation course 1 642 0 358 Name: count, dtype: int64
In [10]:
labels = ["not completed","completed"]
color=["red","green"]
# plt.figure(facecolor="green")
plt.pie(df["test preparation course"].value_counts(),labels=labels,colors=color)
Out[10]:
([<matplotlib.patches.Wedge at 0x16b168680>, <matplotlib.patches.Wedge at 0x16b1249e0>], [Text(-0.47460171119818767, 0.9923473261553901, 'not completed'), Text(0.4746018041084478, -0.9923472817199666, 'completed')])
In [11]:
plt.figure(figsize=(12,6))
sns.barplot(x="test preparation course",y="Average Score",data=df)
plt.show()
# 0 for done and 1 for not done so we can say who have done has better result
In [12]:
sns.barplot(x = df["lunch"],y=df["Average Score"],palette=["red","green"])
# plt.show()
# 1 is standard and 0 is free or reduced
/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/265420584.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x = df["lunch"],y=df["Average Score"],palette=["red","green"])
Out[12]:
<Axes: xlabel='lunch', ylabel='Average Score'>
In [13]:
sns.barplot(x=df['parental level of education'],y=df["Average Score"],data=df,palette="inferno")
# 0=associate degree, 1=bachelor's degree,2=high school,3=master's degree,4=some college,5= some high school
/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/3868579111.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=df['parental level of education'],y=df["Average Score"],data=df,palette="inferno")
Out[13]:
<Axes: xlabel='parental level of education', ylabel='Average Score'>
In [14]:
plt.figure(figsize=(12,6))
sns.pairplot(df)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x16b266480>
<Figure size 1200x600 with 0 Axes>
In [15]:
plt.figure(figsize=(12,6))
sns.heatmap(df.corr())
Out[15]:
<Axes: >
In [16]:
# Lets divide into target varaibel and predictor
y=df["Average Score"]#Target score
x=df.drop(["Average Score","reading score","writing score","math score"],axis=1)
y.shape
Out[16]:
(1000,)
In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
y_train.shape
Out[17]:
(700,)
In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
lr=LinearRegression()
lr.fit(X_train,y_train)
y_predict=lr.predict(X_test)
In [19]:
y_predict
Out[19]:
array([70.98127112, 66.27322644, 68.91170688, 56.90687812, 77.26157742,
60.61150762, 64.47634728, 74.5734257 , 58.67457349, 71.00885117,
53.91685753, 69.08094565, 67.15707412, 62.40838679, 75.32464328,
78.44729397, 68.46978304, 60.16958378, 70.70858605, 64.17608215,
66.09092506, 69.5359321 , 68.21016058, 61.35530033, 66.14059634,
69.35363073, 76.65041481, 70.98127112, 65.22013999, 62.40838679,
63.71206709, 75.32464328, 56.29571551, 69.5228695 , 69.5228695 ,
58.53451852, 60.61150762, 71.15050989, 69.65549959, 60.02952881,
75.15540451, 71.00885117, 76.65041481, 65.33970748, 60.61150762,
81.70999963, 59.72765994, 70.09742344, 74.88271944, 73.96226309,
71.45077501, 72.91820526, 73.38770915, 66.27322644, 68.77165191,
71.15050989, 68.91170688, 67.41669659, 65.47976245, 72.63649156,
69.07191703, 67.76823673, 70.98127112, 58.53451852, 67.02444402,
69.5359321 , 62.40838679, 57.7907258 , 70.14709471, 72.91820526,
79.02927279, 70.53934728, 56.90687812, 58.36527975, 72.94578531,
76.81965358, 78.88921781, 56.73763935, 64.89778364, 65.20707738,
58.84381226, 58.40188842, 66.97477275, 69.21357575, 65.03783861,
61.35530033, 78.88921781, 74.5734257 , 76.51035983, 53.03300985,
57.34880196, 73.08744403, 68.91170688, 55.85379167, 67.27664162,
69.35363073, 67.58593536, 65.33970748, 70.98127112, 69.97785594,
75.15540451, 69.5228695 , 63.54282832, 62.23914802, 62.65898063,
64.59591477, 71.29056486, 66.53284891, 75.89919722, 70.09742344,
63.73415831, 59.4183662 , 62.65898063, 66.66547901, 60.47145265,
66.53284891, 64.2036622 , 76.20849097, 72.33462269, 67.46636786,
57.03950822, 60.47145265, 68.0278592 , 67.76823673, 78.44729397,
67.14401152, 80.65691318, 76.51035983, 65.47976245, 72.94578531,
70.70858605, 69.35363073, 58.40188842, 81.26807579, 70.98127112,
61.35530033, 78.2780552 , 71.00885117, 53.47493369, 58.36527975,
66.11301629, 75.89919722, 66.14059634, 68.46978304, 74.88271944,
60.30221388, 61.33320911, 67.14401152, 68.51945432, 73.97129171,
81.70999963, 76.78304491, 67.41669659, 56.46495428, 71.59243373,
73.08744403, 63.29223447, 66.66547901, 69.84522584, 74.85513939,
78.2780552 , 72.33462269, 78.88921781, 64.2036622 , 75.32464328,
65.64900122, 65.52943373, 60.78074639, 78.2780552 , 69.04433698,
71.00885117, 71.29056486, 71.91479008, 70.14709471, 58.84381226,
61.96646295, 68.46978304, 68.65208442, 74.85513939, 76.51035983,
68.1604893 , 61.35530033, 70.58901855, 64.2036622 , 69.96479334,
70.40671718, 68.60241314, 62.0974893 , 66.53284891, 71.45077501,
62.26672807, 63.27014324, 58.67457349, 59.86029004, 66.41328141,
71.03094239, 79.02927279, 58.53451852, 70.98127112, 78.44729397,
71.59243373, 69.5359321 , 62.40838679, 57.34880196, 70.40671718,
55.85379167, 78.2780552 , 78.71997904, 76.20849097, 60.02952881,
63.27014324, 60.47145265, 76.81965358, 68.77165191, 67.27664162,
67.41669659, 69.35363073, 66.53284891, 78.88921781, 68.46978304,
61.77513295, 64.34532092, 76.95228368, 74.27155683, 55.85379167,
67.71856546, 71.59243373, 68.60241314, 61.96646295, 75.15540451,
67.58593536, 62.65898063, 64.17608215, 62.53941314, 67.58593536,
67.27664162, 67.41669659, 67.76823673, 66.83471777, 65.47976245,
54.80070522, 65.64900122, 66.71515028, 70.98127112, 77.26157742,
66.83471777, 58.67457349, 70.8396124 , 59.72765994, 79.02927279,
60.77171777, 74.5734257 , 60.91337649, 74.85513939, 63.27014324,
60.02952881, 67.15707412, 78.58734894, 68.46978304, 69.04433698,
52.59108601, 69.21357575, 80.82615195, 70.53934728, 68.21016058,
70.28714968, 69.5228695 , 69.04433698, 64.59591477, 75.15540451,
58.36527975, 70.70858605, 68.0278592 , 69.07191703, 57.34880196,
59.4183662 , 64.2036622 , 59.72765994, 55.85379167, 78.71997904,
61.79722418, 66.11301629, 67.41669659, 67.41669659, 56.90687812,
78.71997904, 71.15050989, 65.47976245, 68.46978304, 70.40671718,
64.59591477, 68.46978304, 76.78304491, 70.8396124 , 69.21357575,
65.22013999, 58.84381226, 52.59108601, 54.96994399, 76.95228368])
In [20]:
y_test
Out[20]:
521 87.00000
737 64.00000
740 75.00000
660 74.66667
411 81.66667
...
468 83.33333
935 63.66667
428 59.00000
7 40.66667
155 82.33333
Name: Average Score, Length: 300, dtype: float64
In [21]:
diff=abs(y_predict-y_test)
diff.mean()
Out[21]:
10.671228279143728