Uber Drivers EDA


 import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
In [18]:
'''reading dataset'''
df = pd.read_csv('My Uber Drives - 2016.csv')
In [19]:
''' displaying first 5 rows of dataset '''
df.head()
Out[19]:
START_DATE*END_DATE*CATEGORY*START*STOP*MILES*PURPOSE*
01/1/2016 21:111/1/2016 21:17BusinessFort PierceFort Pierce5.1Meal/Entertain
11/2/2016 1:251/2/2016 1:37BusinessFort PierceFort Pierce5.0NaN
21/2/2016 20:251/2/2016 20:38BusinessFort PierceFort Pierce4.8Errand/Supplies
31/5/2016 17:311/5/2016 17:45BusinessFort PierceFort Pierce4.7Meeting
41/6/2016 14:421/6/2016 15:49BusinessFort PierceWest Palm Beach63.7Customer Visit
In [20]:
'''columns in dataset '''
df.columns
Out[20]:
Index(['START_DATE*', 'END_DATE*', 'CATEGORY*', 'START*', 'STOP*', 'MILES*',
       'PURPOSE*'],
      dtype='object')
In [21]:
df.shape
Out[21]:
(1156, 7)
In [22]:
'''checking null values in dataset '''
df.isnull().sum()
Out[22]:
START_DATE*      0
END_DATE*        1
CATEGORY*        1
START*           1
STOP*            1
MILES*           0
PURPOSE*       503
dtype: int64
In [23]:
"""find duplicate rows"""
df[df.duplicated()]
Out[23]:
START_DATE*END_DATE*CATEGORY*START*STOP*MILES*PURPOSE*
4926/28/2016 23:346/28/2016 23:59BusinessDurhamCary9.9Meeting
In [24]:
## there is one duplicated row, we are gonna remove this duplicate row
df.drop_duplicates(inplace=True)
In [25]:
'''There we have 4 rows having starting and end-time exactly equal i.e. zero trip time, 
while having non-zero miles (distance) - which cannot be possible.'''

df.drop(df.index[[751, 761, 798, 807]], inplace=True)
In [29]:
''' Renaming column names '''
df.columns = ['START_DATE', 'END_DATE', 'CATEGORY', 'START', 'STOP', 'MILES', 'PURPOSE']
In [35]:
''' Converting START_DATE and END_DATE into date time'''

df['START_DATE'] = pd.to_datetime(df['START_DATE'], errors='coerce')
df['END_DATE'] = pd.to_datetime(df['END_DATE'], errors='coerce')
In [36]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1151 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   START_DATE  1150 non-null   datetime64[ns]
 1   END_DATE    1150 non-null   datetime64[ns]
 2   CATEGORY    1150 non-null   object        
 3   START       1150 non-null   object        
 4   STOP        1150 non-null   object        
 5   MILES       1151 non-null   float64       
 6   PURPOSE     652 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 71.9+ KB
In [37]:
df.head()
Out[37]:
START_DATEEND_DATECATEGORYSTARTSTOPMILESPURPOSE
02016-01-01 21:11:002016-01-01 21:17:00BusinessFort PierceFort Pierce5.1Meal/Entertain
12016-01-02 01:25:002016-01-02 01:37:00BusinessFort PierceFort Pierce5.0NaN
22016-01-02 20:25:002016-01-02 20:38:00BusinessFort PierceFort Pierce4.8Errand/Supplies
32016-01-05 17:31:002016-01-05 17:45:00BusinessFort PierceFort Pierce4.7Meeting
42016-01-06 14:42:002016-01-06 15:49:00BusinessFort PierceWest Palm Beach63.7Customer Visit
In [40]:
''' Count Plot '''
plt.figure(figsize=(10, 5))
sns.countplot(df['CATEGORY']);
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [46]:
start_labels = df.START.value_counts().nlargest(10)
In [53]:
''' Bar Plot '''
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(start_labels.index, start_labels);
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [57]:
''' Bar Plot '''
stop_labels = df.STOP.value_counts().nlargest(10)
plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(stop_labels.index, stop_labels);
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [58]:
''' Extracting months from column START_DATE '''
df['MONTH'] = pd.DatetimeIndex(df['START_DATE']).month
In [65]:
month_label = {1.0: 'Jan', 2.0: 'Feb', 3.0: 'Mar', 4.0: 'April', 5.0: 'May', 6.0: 'June', 7.0: 'July', 8.0: 'Aug', 9.0: 'Sep',
      10.0: 'Oct', 11.0: 'Nov', 12.0: 'Dec' }
df["MONTH"] = df.MONTH.map(month_label)
In [67]:
df.MONTH.unique()
Out[67]:
array(['Jan', 'Feb', 'Mar', 'April', 'May', 'June', 'July', 'Aug', 'Sep',
       'Oct', 'Nov', 'Dec', nan], dtype=object)
In [68]:
df.head()
Out[68]:
START_DATEEND_DATECATEGORYSTARTSTOPMILESPURPOSEMONTH
02016-01-01 21:11:002016-01-01 21:17:00BusinessFort PierceFort Pierce5.1Meal/EntertainJan
12016-01-02 01:25:002016-01-02 01:37:00BusinessFort PierceFort Pierce5.0NaNJan
22016-01-02 20:25:002016-01-02 20:38:00BusinessFort PierceFort Pierce4.8Errand/SuppliesJan
32016-01-05 17:31:002016-01-05 17:45:00BusinessFort PierceFort Pierce4.7MeetingJan
42016-01-06 14:42:002016-01-06 15:49:00BusinessFort PierceWest Palm Beach63.7Customer VisitJan
In [73]:
''' Bar Plot '''
month_count = df.MONTH.value_counts()
plt.figure(figsize=(10, 5))
sns.barplot(month_count.index, month_count);
plt.xlabel('Months')
plt.ylabel('Value Counts');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [91]:
''' creating a dictonary that contains info about miles column '''
miles_dic = {}

for i in df.MILES:
    if i < 10:
        if '0-10 miles' not in miles_dic:
            miles_dic['0-10 miles'] = [i]
        else:
            miles_dic['0-10 miles'].append(i)
            
    elif i >= 10 and i < 20:
        if '10-20 miles' not in miles_dic:
            miles_dic['10-20 miles'] = [i]
        else:
            miles_dic['10-20 miles'].append(i)
            
    elif i >= 20 and i < 30:
        if '20-30 miles' not in miles_dic:
            miles_dic['20-30 miles'] = [i]
        else:
            miles_dic['20-30 miles'].append(i)
            
    elif i >= 30 and i < 40:
        if '30-40 miles' not in miles_dic:
            miles_dic['30-40 miles'] = [i]
        else:
            miles_dic['30-40 miles'].append(i)
            
    elif i >= 40 and i < 50:
        if '40-50 miles' not in miles_dic:
            miles_dic['40-50 miles'] = [i]
        else:
            miles_dic['40-50 miles'].append(i)
    else:
        if 'Above 50 miles' not in miles_dic:
            miles_dic['Above 50 miles'] = [i]
        else:
            miles_dic['Above 50 miles'].append(i)
In [96]:
len_miles = []
for key in miles_dic:
    len_miles.append((key, len(miles_dic[key])))
In [108]:
a, b = [], []
for i, j in len_miles:
    a.append(i)
    b.append(j)

plt.figure(figsize=(10, 5))
plt.xticks(rotation=75)
sns.barplot(a, b)
plt.xlabel('Miles')
plt.ylabel('Count');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [109]:
df.head()
Out[109]:
START_DATEEND_DATECATEGORYSTARTSTOPMILESPURPOSEMONTH
02016-01-01 21:11:002016-01-01 21:17:00BusinessFort PierceFort Pierce5.1Meal/EntertainJan
12016-01-02 01:25:002016-01-02 01:37:00BusinessFort PierceFort Pierce5.0NaNJan
22016-01-02 20:25:002016-01-02 20:38:00BusinessFort PierceFort Pierce4.8Errand/SuppliesJan
32016-01-05 17:31:002016-01-05 17:45:00BusinessFort PierceFort Pierce4.7MeetingJan
42016-01-06 14:42:002016-01-06 15:49:00BusinessFort PierceWest Palm Beach63.7Customer VisitJan
In [130]:
''' Now we want to know how many trips were made on Day time and how many on Night time '''
t = pd.to_datetime(['18:00:00']).time
In [133]:
def check_time(tim):
    if t > tim:
        tim = 'DAY RIDE'
    else:
        tim = 'NIGHT RIDE'
    
df['DAY/NIGHT'] = df.apply(lambda x : 'NIGHT RIDE' if x['START_DATE'].time() > t else 'DAY RIDE', axis=1)
In [137]:
day_night_label = df['DAY/NIGHT'].value_counts()

''' Bar Plot '''
plt.figure(figsize=(10, 5))
sns.barplot(day_night_label.index, day_night_label);
plt.ylabel('COUNT')
plt.xlabel('DAY/NIGHT');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [141]:
df['DAY'] = df.START_DATE.dt.weekday
In [144]:
day_label = {
    0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thus', 4: 'Fir', 5: 'Sat', 6: 'Sun'
}
df['DAY'] = df['DAY'].map(day_label)
In [149]:
day_label = df.DAY.value_counts()
'''bar plot of days'''
plt.figure(figsize=(10, 5))
sns.barplot(day_label.index, day_label);
plt.xlabel('DAY')
plt.ylabel('COUNT');
C:\Users\jgaur\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [156]:
''' count plot of day'''
plt.figure(figsize=(10, 5))
sns.countplot(hue='CATEGORY', x='DAY', data=df);
In [ ]:

No comments:

Post a Comment