import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [18]:
# Load the trip log from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='My Uber Drives - 2016.csv')
In [19]:
# Preview the first five rows of the dataset.
df.head(n=5)
Out[19]:
In [20]:
# Show the column labels of the dataset.
df.columns
Out[20]:
In [21]:
# Dimensions of the loaded data as a (rows, columns) tuple.
df.shape
Out[21]:
In [22]:
# Count the missing values in every column.
df.isna().sum()
Out[22]:
In [23]:
"""find duplicate rows"""
# duplicated() marks every repeat after the first occurrence of a row,
# so this shows only the redundant copies.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]
Out[23]:
In [24]:
# The check above found one duplicated row; remove it in place and keep the
# first occurrence.
df.drop_duplicates(keep='first', inplace=True)
In [25]:
# Four rows have identical start and end times (zero trip duration) while
# reporting a non-zero distance, which is not possible, so they are removed.
# NOTE(review): the positions are hard-coded and refer to the row order after
# the duplicate row has been dropped - re-check them if the input file changes.
zero_duration_positions = [751, 761, 798, 807]
df.drop(index=df.index[zero_duration_positions], inplace=True)
In [29]:
# Replace the column labels. The names are assigned by position, so the list
# must follow the existing column order.
df.columns = [
    'START_DATE',
    'END_DATE',
    'CATEGORY',
    'START',
    'STOP',
    'MILES',
    'PURPOSE',
]
In [35]:
# Parse both timestamp columns into datetimes; values that cannot be parsed
# become NaT because of errors='coerce'.
for date_col in ('START_DATE', 'END_DATE'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
In [36]:
# Print the column dtypes and non-null counts (checks the date conversion).
df.info()
In [37]:
# Preview the first five rows after renaming the columns and parsing dates.
df.head(n=5)
Out[37]:
In [40]:
# Count plot: number of trips per CATEGORY value.
# The column is selected by keyword (x=..., data=...) rather than passed as a
# positional Series: newer seaborn releases accept only `data` positionally,
# and the keyword form behaves the same on older releases.
plt.figure(figsize=(10, 5))
sns.countplot(x='CATEGORY', data=df);
In [46]:
# Trip counts of the ten most frequent start locations.
start_labels = df['START'].value_counts().nlargest(10)
In [53]:
# Bar plot: trip counts of the most frequent start locations.
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(x=start_labels.index, y=start_labels);
plt.ylabel('Value Counts');
In [57]:
# Bar plot: trip counts of the ten most frequent stop locations.
stop_labels = df['STOP'].value_counts().nlargest(10)
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
sns.barplot(x=stop_labels.index, y=stop_labels);
plt.ylabel('Value Counts');
In [58]:
# Month number of each trip's start; rows whose START_DATE is NaT get NaN.
df['MONTH'] = df['START_DATE'].dt.month
In [65]:
# Translate month numbers into short month names. The keys are floats
# (1.0 .. 12.0), in calendar order.
month_names = [
    'Jan', 'Feb', 'Mar', 'April', 'May', 'June',
    'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_label = {
    float(number): name for number, name in enumerate(month_names, start=1)
}
df["MONTH"] = df["MONTH"].map(month_label)
In [67]:
# Distinct month labels present after the mapping.
df['MONTH'].unique()
Out[67]:
In [68]:
# Preview the first five rows, now including the MONTH column.
df.head(n=5)
Out[68]:
In [73]:
# Bar plot: number of trips per month.
month_count = df['MONTH'].value_counts()
plt.figure(figsize=(10, 5))
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
sns.barplot(x=month_count.index, y=month_count);
plt.xlabel('Months')
plt.ylabel('Value Counts');
In [91]:
# Group the MILES values into distance buckets.
# miles_dic maps a bucket label to the list of trip distances that fall into
# it; a label is only created once a trip lands in that bucket, so the keys
# appear in first-seen order.

# (exclusive upper bound, label) pairs, in ascending order.
_MILES_BUCKETS = (
    (10, '0-10 miles'),
    (20, '10-20 miles'),
    (30, '20-30 miles'),
    (40, '30-40 miles'),
    (50, '40-50 miles'),
)


def _miles_bucket(miles):
    """Return the bucket label for a single distance value."""
    for upper_bound, label in _MILES_BUCKETS:
        if miles < upper_bound:
            return label
    # Everything from 50 miles upwards (50 itself included) lands here.
    return 'Above 50 miles'


miles_dic = {}
for miles in df.MILES:
    # A missing distance compares False against every bound and would
    # otherwise be counted as 'Above 50 miles'; leave it out instead.
    if pd.isna(miles):
        continue
    miles_dic.setdefault(_miles_bucket(miles), []).append(miles)
In [96]:
# (bucket label, number of trips) pairs, in the order the buckets were created.
len_miles = [(label, len(distances)) for label, distances in miles_dic.items()]
In [108]:
# Bar plot: number of trips per distance bucket.
# Split the (label, count) pairs into the x values (a) and bar heights (b).
a = [label for label, _ in len_miles]
b = [count for _, count in len_miles]
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
sns.barplot(x=a, y=b)
plt.xlabel('Miles')
plt.ylabel('Count');
In [109]:
# Preview the first five rows again before adding the day/night feature.
df.head(n=5)
Out[109]:
In [130]:
# Cut-off used to count how many trips were made in the day time and how many
# at night. `.time` on a DatetimeIndex yields an array, so `t` is a
# one-element array holding the time of day 18:00:00.
cutoff_index = pd.to_datetime(['18:00:00'])
t = cutoff_index.time
In [133]:
def check_time(tim):
    """Label a time of day as a day ride or a night ride.

    tim -- a datetime.time, or None/NaN when the start time is unknown.

    Returns 'NIGHT RIDE' when `tim` is strictly later than the module-level
    cut-off `t`, 'DAY RIDE' otherwise, and NaN when `tim` is missing.
    """
    if tim is None or pd.isna(tim):
        return np.nan
    # `t` is a one-element array, so the comparison yields a one-element
    # boolean array whose truth value is that single result.
    return 'NIGHT RIDE' if tim > t else 'DAY RIDE'


# START_DATE was parsed with errors='coerce', so it may contain NaT, which has
# no time of day; such rows are passed on as None and end up as NaN.
df['DAY/NIGHT'] = df['START_DATE'].apply(
    lambda start: check_time(None if pd.isna(start) else start.time())
)
In [137]:
# Bar plot: number of trips per DAY/NIGHT label.
day_night_label = df['DAY/NIGHT'].value_counts()
plt.figure(figsize=(10, 5))
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
sns.barplot(x=day_night_label.index, y=day_night_label);
plt.ylabel('COUNT')
plt.xlabel('DAY/NIGHT');
In [141]:
# Weekday number of each trip's start (Monday=0 ... Sunday=6).
df['DAY'] = df['START_DATE'].dt.weekday
In [144]:
# Translate weekday numbers (Monday=0 ... Sunday=6) into short day names.
# These names are what the plots display, so the misspelled 'Thus' and 'Fir'
# are corrected here.
day_label = {
    0: 'Mon',
    1: 'Tues',
    2: 'Wed',
    3: 'Thurs',
    4: 'Fri',
    5: 'Sat',
    6: 'Sun',
}
df['DAY'] = df['DAY'].map(day_label)
In [149]:
# Bar plot: number of trips per weekday.
# NOTE: this rebinds `day_label` from the number-to-name mapping to the
# per-day trip counts.
day_label = df['DAY'].value_counts()
plt.figure(figsize=(10, 5))
# x and y are passed by keyword; newer seaborn releases reject positional
# data vectors, while the keyword form works on old and new releases alike.
sns.barplot(x=day_label.index, y=day_label);
plt.xlabel('DAY')
plt.ylabel('COUNT');
In [156]:
# Count plot: trips per weekday, with one bar per CATEGORY value.
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='DAY', hue='CATEGORY');
In [ ]:
No comments:
Post a Comment