Drinking Water Potability


  import pandas as pd

import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 
import warnings 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Notebook-wide plotting style.
sns.set_style(style='darkgrid')
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices.
warnings.simplefilter('ignore')
In [2]:
# Load the dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('water_potability.csv')
In [3]:
# Peek at the first five rows of the raw data.
df.head(5)
Out[3]:
phHardnessSolidsChloraminesSulfateConductivityOrganic_carbonTrihalomethanesTurbidityPotability
0NaN204.89045520791.3189817.300212368.516441564.30865410.37978386.9909702.9631350
13.716080129.42292118630.0578586.635246NaN592.88535915.18001356.3290764.5006560
28.099124224.23625919909.5417329.275884NaN418.60621316.86863766.4200933.0559340
38.316766214.37339422018.4174418.059332356.886136363.26651618.436524100.3416744.6287710
49.092223181.10150917978.9863396.546600310.135738398.41081311.55827931.9979934.0750750
In [4]:
# (rows, columns) of the loaded frame.
df.shape
Out[4]:
(3276, 10)
In [5]:
# Number of missing entries per column.
df.isna().sum()
Out[5]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [6]:
# Column dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [7]:
# Distinct values per column. NaN is counted as one value, which is why the
# columns with missing data report one more than their non-null count.
for column, n_distinct in df.nunique(dropna=False).items():
    print("{} has {} unique values".format(column, n_distinct))
ph has 2786 unique values
Hardness has 3276 unique values
Solids has 3276 unique values
Chloramines has 3276 unique values
Sulfate has 2496 unique values
Conductivity has 3276 unique values
Organic_carbon has 3276 unique values
Trihalomethanes has 3115 unique values
Turbidity has 3276 unique values
Potability has 2 unique values
In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
df.describe()
Out[8]:
phHardnessSolidsChloraminesSulfateConductivityOrganic_carbonTrihalomethanesTurbidityPotability
count2785.0000003276.0000003276.0000003276.0000002495.0000003276.0000003276.0000003114.0000003276.0000003276.000000
mean7.080795196.36949622014.0925267.122277333.775777426.20511114.28497066.3962933.9667860.390110
std1.59432032.8797618768.5708281.58308541.41684080.8240643.30816216.1750080.7803820.487849
min0.00000047.432000320.9426110.352000129.000000181.4837542.2000000.7380001.4500000.000000
25%6.093092176.85053815666.6902976.127421307.699498365.73441412.06580155.8445363.4397110.000000
50%7.036752196.96762720927.8336077.130299333.073546421.88496814.21833866.6224853.9550280.000000
75%8.062066216.66745627332.7621278.114887359.950170481.79230416.55765277.3374734.5003201.000000
max14.000000323.12400061227.19600813.127000481.030642753.34262028.300000124.0000006.7390001.000000
In [12]:
# Class balance of the target: number of rows per Potability value.
pot_lbl = df.Potability.value_counts()

# Bar chart of the class counts.
plt.figure(figsize=(8, 5))
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 accepts only `data` positionally, so the old call
# raises TypeError there.
sns.barplot(x=pot_lbl.index, y=pot_lbl.values)
plt.xlabel('Potability', fontsize=15)
plt.ylabel('count', fontsize=15);
In [13]:
# Pairwise correlations between all columns, each cell annotated with its coefficient.
plt.figure(figsize=(10, 10))
ax = sns.heatmap(data=df.corr(), annot=True)
ax.set_title('Correlation Matrix', fontsize=20);
In [20]:
# One horizontal box plot per feature (the last column, the target, is skipped).
for column in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    # Pass the series as `x=`: from seaborn 0.12 a lone positional argument is
    # interpreted as `data`, which changes how the plot is oriented.
    sns.boxplot(x=df[column])
    plt.title('Box plot of {}'.format(column), fontsize=20);
In [21]:
# One horizontal violin plot per feature (the last column, the target, is skipped).
for column in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    # Pass the series as `x=`: from seaborn 0.12 a lone positional argument is
    # interpreted as `data`, which changes how the plot is oriented.
    sns.violinplot(x=df[column])
    plt.title('Violin plot of {}'.format(column), fontsize=20);
In [25]:
# Histogram with a KDE overlay for each feature (the target column is skipped).
# The body must use the loop variable itself; the earlier version read a stale
# `column` left over from a previous cell and drew the same column every time.
for feature in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    sns.histplot(x=df[feature], kde=True)
    plt.xlabel(feature, fontsize=15)
    plt.ylabel('count', fontsize=15)
    plt.title('Histogram plot of {}'.format(feature), fontsize=20);
In [34]:
# Share of missing entries per column, as a percentage of all rows.
null_pct = df.isnull().sum() / len(df) * 100
for feature, pct in null_pct.items():
    print("{} \t {:.1f}% null values".format(feature, pct))
ph 	 15.0% null values
Hardness 	 0.0% null values
Solids 	 0.0% null values
Chloramines 	 0.0% null values
Sulfate 	 23.8% null values
Conductivity 	 0.0% null values
Organic_carbon 	 0.0% null values
Trihalomethanes 	 4.9% null values
Turbidity 	 0.0% null values
Potability 	 0.0% null values
In [35]:
# Prepare the data for modelling: fill each missing value with the mean of the
# same feature among rows of the same Potability class (0 and 1 are handled
# separately, in that order, feature by feature).
#
# NOTE(review): the fill values are derived from the target label and are
# computed on the full frame before the train/test split, so label information
# leaks into the features of the rows later held out for testing. The reported
# accuracies are likely optimistic because of this. A leak-free variant would
# split first and fit a label-agnostic imputer on the training rows only; the
# logic is left as-is here so the recorded outputs still match.
for feature in ('ph', 'Sulfate', 'Trihalomethanes'):
    for label in (0, 1):
        in_class = df['Potability'] == label
        # Mean over the non-missing values of this class only.
        class_mean = df.loc[in_class, feature].mean(skipna=True)
        df.loc[in_class & df[feature].isna(), feature] = class_mean
In [37]:
# Preview the first rows again; `df` was modified in place by the imputation cell.
df.head()
Out[37]:
phHardnessSolidsChloraminesSulfateConductivityOrganic_carbonTrihalomethanesTurbidityPotability
07.085378204.89045520791.3189817.300212368.516441564.30865410.37978386.9909702.9631350
13.716080129.42292118630.0578586.635246334.564290592.88535915.18001356.3290764.5006560
28.099124224.23625919909.5417329.275884334.564290418.60621316.86863766.4200933.0559340
38.316766214.37339422018.4174418.059332356.886136363.26651618.436524100.3416744.6287710
49.092223181.10150917978.9863396.546600310.135738398.41081311.55827931.9979934.0750750
In [38]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
In [43]:
# Report the size of each partition.
for caption, part in (("X_train shape: ", X_train), ("X_test shape: ", X_test)):
    print(caption, part.shape)
X_train shape:  (2620, 9)
X_test shape:  (656, 9)
In [44]:
# Standardise the features: the scaler learns its statistics from the training
# rows only, and the same fitted transformation is then applied to both parts.
sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
In [48]:
# Candidate classifiers, all with library defaults. The random forest is
# seeded so that Restart & Run All reproduces the same score; unseeded, its
# accuracy changes from run to run.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
    GaussianNB(),
    SVC(),
]

# Test-set accuracy of each model, in the same order as `models`.
models_acc = []

for model in models:
    # Fit on the scaled training rows.
    model.fit(X_train, y_train)

    # Predict the held-out rows.
    pred = model.predict(X_test)

    # Record the accuracy for this model.
    models_acc.append(accuracy_score(y_test, pred))
In [53]:
# Accuracy table, one row per fitted model. The names are read from the
# estimator objects, so they cannot drift out of step with the order or
# content of the `models` list that produced `models_acc`.
res = pd.DataFrame({
    'Model Accuracy': models_acc,
    "Model Name": [type(model).__name__ for model in models],
})
In [54]:
# Show the accuracy table (displayed as the cell's last expression).
res
Out[54]:
Model AccuracyModel Name
00.626524LogisticRegression
10.635671KNeighborsClassifier
20.800305RandomForestClassifier
30.615854GaussianNB
40.682927SVC
In [57]:
# Horizontal bar chart: one bar per model, bar length = test-set accuracy.
plt.figure(figsize=(10, 5))
# Columns are named by keyword and looked up in `res`: seaborn >= 0.12 accepts
# only `data` positionally, so the positional x/y form raises TypeError there.
sns.barplot(x='Model Accuracy', y='Model Name', data=res)
plt.xlabel('Model Accuracy', fontsize=15)
plt.ylabel('Model Name', fontsize=15);
In [ ]:

In [ ]:

In [ ]:

In [ ]:

No comments:

Post a Comment