import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')
In [2]:
# Load the water-potability CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer='water_potability.csv')
In [3]:
# Preview the first five rows of the frame.
df.head(n=5)
Out[3]:
In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
Out[4]:
In [5]:
# Number of missing values per column (isna is the canonical name of isnull).
df.isna().sum()
Out[5]:
In [6]:
# Print column dtypes, non-null counts and memory usage.
df.info()
In [7]:
# Report how many distinct values each column holds.
# dropna=False keeps NaN counted as a value, matching len(Series.unique()).
for column in df.columns:
    print(f"{column} has {df[column].nunique(dropna=False)} unique values")
In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
df.describe()
Out[8]:
In [12]:
# Class balance of the target: number of samples per Potability value.
pot_lbl = df['Potability'].value_counts()

# Bar plot of the class counts. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(8, 5))
sns.barplot(x=pot_lbl.index, y=pot_lbl.values)
plt.xlabel('Potability', fontsize=15)
plt.ylabel('count', fontsize=15);
In [13]:
# Pairwise correlation of all columns, drawn as an annotated heat map.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True)
plt.title('Correlation Matrix', fontsize=20);
In [20]:
# One box plot per feature; the last column is skipped because it is the
# target. The series is passed as x= so the plot stays horizontal: seaborn
# >= 0.12 would otherwise treat a positional Series as `data`.
for column in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df[column])
    plt.title('Box plot of {}'.format(column), fontsize=20)
In [21]:
# One violin plot per feature; the last column is skipped because it is the
# target. The series is passed as x= so the plot stays horizontal: seaborn
# >= 0.12 would otherwise treat a positional Series as `data`.
for column in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    sns.violinplot(x=df[column])
    plt.title('Violin plot of {}'.format(column), fontsize=20)
In [25]:
# Histogram with a KDE overlay for every feature (all columns but the last).
# The body must use the loop variable itself; reading a differently named
# variable left over from an earlier cell would draw the same column each time.
for feature in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    sns.histplot(data=df, x=feature, kde=True)
    plt.xlabel(feature, fontsize=15)
    plt.ylabel('count', fontsize=15)
    plt.title('Histogram plot of {}'.format(feature), fontsize=20)
In [34]:
# Share of missing values in each column, as a percentage of all rows.
n_rows = len(df)
for feature in df.columns:
    missing_pct = (df[feature].isnull().sum() / n_rows) * 100
    print("{} \t {:.1f}% null values".format(feature, missing_pct))
In [35]:
# Fill missing values in the three incomplete feature columns with the mean
# of that column among rows sharing the same Potability label (0 or 1).
#
# NOTE(review): this imputation reads the target column and runs before the
# train/test split, so label information (and test-set statistics) leaks
# into the features. Consider imputing from training-set statistics only,
# without conditioning on Potability.
for col in ('ph', 'Sulfate', 'Trihalomethanes'):
    for label in (0, 1):
        in_class = df['Potability'] == label
        # Mean over the observed (non-NaN) values of this class only.
        class_mean = df.loc[in_class, col].mean(skipna=True)
        df.loc[in_class & df[col].isna(), col] = class_mean
In [37]:
# Preview the first five rows again, now that missing values are filled.
df.head(n=5)
Out[37]:
In [38]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
In [42]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
In [43]:
# Show the dimensions of the training and test feature matrices.
for label, part in (("X_train shape: ", X_train), ("X_test shape: ", X_test)):
    print(label, part.shape)
In [44]:
# Standardize features: learn mean and variance on the training set only,
# then apply the same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
In [48]:
# Train each candidate classifier on the scaled training data and record its
# accuracy on the held-out test set, in the same order as `models`.
models_acc = []
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    # Seeded so the reported accuracy is reproducible, like the data split.
    RandomForestClassifier(random_state=0),
    GaussianNB(),
    SVC(),
]
for model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    models_acc.append(accuracy_score(y_test, pred))
In [53]:
# Tabulate the accuracy of every fitted model. Names are derived from the
# estimator classes so they cannot drift out of sync with `models`.
res = pd.DataFrame({
    'Model Accuracy': models_acc,
    "Model Name": [type(model).__name__ for model in models],
})
In [54]:
# Display the model comparison table (accuracy and name per model).
res
Out[54]:
In [57]:
# Horizontal bar chart comparing model accuracies. x and y are passed by
# keyword because seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(10, 5))
sns.barplot(x=res['Model Accuracy'], y=res['Model Name'])
plt.xlabel('Model Accuracy', fontsize=15)
plt.ylabel('Model Name', fontsize=15);
In [ ]:
In [ ]:
In [ ]:
In [ ]:
No comments:
Post a Comment