import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

# Load the student performance dataset and preview its first 20 rows.
df = pd.read_csv("./data/StudentsPerformance.csv")
df.head(20)

# Dimensions of the loaded frame as (rows, columns).
df.shape

(1000, 8)

# Summary statistics for the numeric columns.
df.describe()

# Per-student mean of the three exam scores, rounded to 5 decimal places.
# skipna=False keeps a missing score propagating to the average instead of
# being silently ignored.
score_columns = ["math score", "reading score", "writing score"]
df["Average Score"] = df[score_columns].mean(axis=1, skipna=False).round(5)
df.head(20)

# Number of students per gender.
df["gender"].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes.
# Each column gets its own fitted encoder, stored in `encoders`, so the
# code -> label mapping stays available afterwards, e.g.
# encoders["lunch"].classes_ or encoders["lunch"].inverse_transform([...]).
# Refitting one shared encoder would discard the mapping of every column
# except the last one.
categorical_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]
encoders = {}
for column in categorical_columns:
    # `lc` ends up bound to the encoder of the last column, as before.
    lc = LabelEncoder()
    df[column] = lc.fit_transform(df[column])
    encoders[column] = lc
df.head(20)

sns.countplot(x=df["gender"] , hue = df["race/ethnicity"])
# 1 is male and 0 is female

<Axes: xlabel='gender', ylabel='count'>

# Class balance of the encoded test preparation column.
df["test preparation course"].value_counts()
# 1 denotes "none" (course not completed) and 0 denotes "completed"

test preparation course
1    642
0    358
Name: count, dtype: int64

# Pie chart of test preparation course completion.
# value_counts() orders its result by frequency, so the labels and colours
# are looked up from the encoded value of each wedge rather than relying on
# a fixed position in a hard-coded list.
course_counts = df["test preparation course"].value_counts()
label_by_code = {1: "not completed", 0: "completed"}
color_by_code = {1: "red", 0: "green"}
labels = [label_by_code[code] for code in course_counts.index]
color = [color_by_code[code] for code in course_counts.index]
plt.pie(course_counts, labels=labels, colors=color)

([<matplotlib.patches.Wedge at 0x16b168680>,
  <matplotlib.patches.Wedge at 0x16b1249e0>],
 [Text(-0.47460171119818767, 0.9923473261553901, 'not completed'),
  Text(0.4746018041084478, -0.9923472817199666, 'completed')])

# Average score by test preparation status on a 12x6 inch figure.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=df,
    y="Average Score",
    x="test preparation course",
)
plt.show()
# Encoding: 0 = completed, 1 = not completed. The author's reading of the
# chart is that students who completed the course have the better result.

# Average score by lunch type.
# `hue` is assigned to the same column as `x` because passing `palette`
# without `hue` is deprecated (removal announced for seaborn v0.14.0);
# legend=False suppresses the legend that `hue` would otherwise add.
sns.barplot(
    data=df,
    x="lunch",
    y="Average Score",
    hue="lunch",
    palette=["red", "green"],
    legend=False,
)
# 1 is standard and 0 is free or reduced

/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/265420584.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x = df["lunch"],y=df["Average Score"],palette=["red","green"])

<Axes: xlabel='lunch', ylabel='Average Score'>

# Average score by parental level of education.
# Columns are referenced by name through `data=df` instead of mixing
# `data=` with Series arguments. `hue` mirrors `x` because passing `palette`
# without `hue` is deprecated (removal announced for seaborn v0.14.0);
# legend=False suppresses the legend that `hue` would otherwise add.
sns.barplot(
    data=df,
    x="parental level of education",
    y="Average Score",
    hue="parental level of education",
    palette="inferno",
    legend=False,
)
# 0=associate's degree, 1=bachelor's degree, 2=high school,
# 3=master's degree, 4=some college, 5=some high school

/var/folders/yc/gs4729dn063_05_6p5wdk2280000gn/T/ipykernel_10443/3868579111.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=df['parental level of education'],y=df["Average Score"],data=df,palette="inferno")

<Axes: xlabel='parental level of education', ylabel='Average Score'>

# Pairwise scatter plots / histograms for every column of df.
# pairplot creates its own figure, so no plt.figure() call precedes it:
# such a call only leaves an empty extra figure behind and does not affect
# the size of the grid.
sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x16b266480>

<Figure size 1200x600 with 0 Axes>

# Heatmap of the pairwise correlations between the columns of df.
plt.figure(figsize=(12, 6))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix)

<Axes: >

# Split the frame into the target variable and the predictors.
y = df["Average Score"]  # target
# The three exam scores are removed from the predictors together with the
# target column itself.
x = df.drop(
    columns=["math score", "reading score", "writing score", "Average Score"]
)
y.shape

(1000,)

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
y_train.shape

(700,)

from sklearn.linear_model import LinearRegression
# NOTE(review): accuracy_score is not used anywhere in the visible code;
# kept in case a later cell relies on it.
from sklearn.metrics import accuracy_score

# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the average score for the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_predict = lr.predict(X_test)

# Display the predicted average scores for the test rows.
y_predict

array([70.98127112, 66.27322644, 68.91170688, 56.90687812, 77.26157742,
       60.61150762, 64.47634728, 74.5734257 , 58.67457349, 71.00885117,
       53.91685753, 69.08094565, 67.15707412, 62.40838679, 75.32464328,
       78.44729397, 68.46978304, 60.16958378, 70.70858605, 64.17608215,
       66.09092506, 69.5359321 , 68.21016058, 61.35530033, 66.14059634,
       69.35363073, 76.65041481, 70.98127112, 65.22013999, 62.40838679,
       63.71206709, 75.32464328, 56.29571551, 69.5228695 , 69.5228695 ,
       58.53451852, 60.61150762, 71.15050989, 69.65549959, 60.02952881,
       75.15540451, 71.00885117, 76.65041481, 65.33970748, 60.61150762,
       81.70999963, 59.72765994, 70.09742344, 74.88271944, 73.96226309,
       71.45077501, 72.91820526, 73.38770915, 66.27322644, 68.77165191,
       71.15050989, 68.91170688, 67.41669659, 65.47976245, 72.63649156,
       69.07191703, 67.76823673, 70.98127112, 58.53451852, 67.02444402,
       69.5359321 , 62.40838679, 57.7907258 , 70.14709471, 72.91820526,
       79.02927279, 70.53934728, 56.90687812, 58.36527975, 72.94578531,
       76.81965358, 78.88921781, 56.73763935, 64.89778364, 65.20707738,
       58.84381226, 58.40188842, 66.97477275, 69.21357575, 65.03783861,
       61.35530033, 78.88921781, 74.5734257 , 76.51035983, 53.03300985,
       57.34880196, 73.08744403, 68.91170688, 55.85379167, 67.27664162,
       69.35363073, 67.58593536, 65.33970748, 70.98127112, 69.97785594,
       75.15540451, 69.5228695 , 63.54282832, 62.23914802, 62.65898063,
       64.59591477, 71.29056486, 66.53284891, 75.89919722, 70.09742344,
       63.73415831, 59.4183662 , 62.65898063, 66.66547901, 60.47145265,
       66.53284891, 64.2036622 , 76.20849097, 72.33462269, 67.46636786,
       57.03950822, 60.47145265, 68.0278592 , 67.76823673, 78.44729397,
       67.14401152, 80.65691318, 76.51035983, 65.47976245, 72.94578531,
       70.70858605, 69.35363073, 58.40188842, 81.26807579, 70.98127112,
       61.35530033, 78.2780552 , 71.00885117, 53.47493369, 58.36527975,
       66.11301629, 75.89919722, 66.14059634, 68.46978304, 74.88271944,
       60.30221388, 61.33320911, 67.14401152, 68.51945432, 73.97129171,
       81.70999963, 76.78304491, 67.41669659, 56.46495428, 71.59243373,
       73.08744403, 63.29223447, 66.66547901, 69.84522584, 74.85513939,
       78.2780552 , 72.33462269, 78.88921781, 64.2036622 , 75.32464328,
       65.64900122, 65.52943373, 60.78074639, 78.2780552 , 69.04433698,
       71.00885117, 71.29056486, 71.91479008, 70.14709471, 58.84381226,
       61.96646295, 68.46978304, 68.65208442, 74.85513939, 76.51035983,
       68.1604893 , 61.35530033, 70.58901855, 64.2036622 , 69.96479334,
       70.40671718, 68.60241314, 62.0974893 , 66.53284891, 71.45077501,
       62.26672807, 63.27014324, 58.67457349, 59.86029004, 66.41328141,
       71.03094239, 79.02927279, 58.53451852, 70.98127112, 78.44729397,
       71.59243373, 69.5359321 , 62.40838679, 57.34880196, 70.40671718,
       55.85379167, 78.2780552 , 78.71997904, 76.20849097, 60.02952881,
       63.27014324, 60.47145265, 76.81965358, 68.77165191, 67.27664162,
       67.41669659, 69.35363073, 66.53284891, 78.88921781, 68.46978304,
       61.77513295, 64.34532092, 76.95228368, 74.27155683, 55.85379167,
       67.71856546, 71.59243373, 68.60241314, 61.96646295, 75.15540451,
       67.58593536, 62.65898063, 64.17608215, 62.53941314, 67.58593536,
       67.27664162, 67.41669659, 67.76823673, 66.83471777, 65.47976245,
       54.80070522, 65.64900122, 66.71515028, 70.98127112, 77.26157742,
       66.83471777, 58.67457349, 70.8396124 , 59.72765994, 79.02927279,
       60.77171777, 74.5734257 , 60.91337649, 74.85513939, 63.27014324,
       60.02952881, 67.15707412, 78.58734894, 68.46978304, 69.04433698,
       52.59108601, 69.21357575, 80.82615195, 70.53934728, 68.21016058,
       70.28714968, 69.5228695 , 69.04433698, 64.59591477, 75.15540451,
       58.36527975, 70.70858605, 68.0278592 , 69.07191703, 57.34880196,
       59.4183662 , 64.2036622 , 59.72765994, 55.85379167, 78.71997904,
       61.79722418, 66.11301629, 67.41669659, 67.41669659, 56.90687812,
       78.71997904, 71.15050989, 65.47976245, 68.46978304, 70.40671718,
       64.59591477, 68.46978304, 76.78304491, 70.8396124 , 69.21357575,
       65.22013999, 58.84381226, 52.59108601, 54.96994399, 76.95228368])

# Display the true average scores of the test rows for comparison.
y_test

521    87.00000
737    64.00000
740    75.00000
660    74.66667
411    81.66667
         ...   
468    83.33333
935    63.66667
428    59.00000
7      40.66667
155    82.33333
Name: Average Score, Length: 300, dtype: float64

# Absolute prediction error per test row, then its mean (the mean absolute
# error of the model on the test split).
diff = (y_test - y_predict).abs()
diff.mean()

10.671228279143728

	math score	reading score	writing score
count	1000.00000	1000.000000	1000.000000
mean	66.08900	69.169000	68.054000
std	15.16308	14.600192	15.195657
min	0.00000	17.000000	10.000000
25%	57.00000	59.000000	57.750000
50%	66.00000	70.000000	69.000000
75%	77.00000	79.000000	79.000000
max	100.00000	100.000000	100.000000

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93
3	male	group A	associate's degree	free/reduced	none	47	57	44
4	male	group C	some college	standard	none	76	78	75
5	female	group B	associate's degree	standard	none	71	83	78
6	female	group B	some college	standard	completed	88	95	92
7	male	group B	some college	free/reduced	none	40	43	39
8	male	group D	high school	free/reduced	completed	64	64	67
9	female	group B	high school	free/reduced	none	38	60	50
10	male	group C	associate's degree	standard	none	58	54	52
11	male	group D	associate's degree	standard	none	40	52	43
12	female	group B	high school	standard	none	65	81	73
13	male	group A	some college	standard	completed	78	72	70
14	female	group A	master's degree	standard	none	50	53	58
15	female	group C	some high school	standard	none	69	75	78
16	male	group C	high school	standard	none	88	89	86
17	female	group B	some high school	free/reduced	none	18	32	28
18	male	group C	master's degree	free/reduced	completed	46	42	46
19	female	group C	associate's degree	free/reduced	none	54	58	61

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score	Average Score
0	0	1	1	1	1	72	72	74	72.66667
1	0	2	4	1	0	69	90	88	82.33333
2	0	1	3	1	1	90	95	93	92.66667
3	1	0	0	0	1	47	57	44	49.33333
4	1	2	4	1	1	76	78	75	76.33333
5	0	1	0	1	1	71	83	78	77.33333
6	0	1	4	1	0	88	95	92	91.66667
7	1	1	4	0	1	40	43	39	40.66667
8	1	3	2	0	0	64	64	67	65.00000
9	0	1	2	0	1	38	60	50	49.33333
10	1	2	0	1	1	58	54	52	54.66667
11	1	3	0	1	1	40	52	43	45.00000
12	0	1	2	1	1	65	81	73	73.00000
13	1	0	4	1	0	78	72	70	73.33333
14	0	0	3	1	1	50	53	58	53.66667
15	0	2	5	1	1	69	75	78	74.00000
16	1	2	2	1	1	88	89	86	87.66667
17	0	1	5	0	1	18	32	28	26.00000
18	1	2	3	0	0	46	42	46	44.66667
19	0	2	0	0	1	54	58	61	57.66667