Nashville Scooter Company Report Card

Data Prep

In [1]:
import pandas as pd
import numpy as np

# 
# assign the path of the code-for-nashville open data portal on github to a variable
# 
dataPath = 'https://raw.githubusercontent.com/code-for-nashville/open-data-portal/feature/scooter-2019-08-clean-up/nashville/scooter-data/'
# dataPath = 'https://raw.githubusercontent.com/code-for-nashville/open-data-portal/feature/scooter-extract/nashville/scooter-data/'

# 
# Make a list of all the files to download from the open data portal
# currently files for July 22 through August 5 are available
# 
fileNames = ['scooter_extract_2019-07-'+str(x)+'.csv' for x in range(22,32)]
fileNames = fileNames + ['scooter_extract_2019-08-0'+str(x)+'.csv' for x in range(1,6)]
# fileNames = ['scooter_extract_2019-07-20_to_2019-09-09.csv']


# 
# make a list of the columns for the facts table
# 
factColumns = ['availability_duration', 'availability_duration_seconds',
       'availability_start_date', 'availability_start_date_cst',
       'availability_start_time', 'availability_start_time_cst',
       'company_name', 'extract_date_cst',
       'extract_date_utc', 'extract_time_cst', 'extract_time_utc',
       'gps_latitude', 'gps_longitude', 'real_time_fare',
       'sumd_id']

# 
# make a list of the columns for the company dimension table and sumd dimension table
# 
companyColumns = ['company_name', 'company_phone', 'company_website']
sumdColumns = ['company_name', 'sumd_group', 'sumd_id', 'sumd_type']
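
As an aside, the same file list can be built from a single date range rather than two hard-coded loops; a minimal sketch (assuming the same scooter_extract_YYYY-MM-DD.csv naming used above):

# build the file list from a date range; strftime handles the month
# boundary and the zero padding that the two list comprehensions do by hand
dates = pd.date_range('2019-07-22', '2019-08-05')
fileNames = ['scooter_extract_' + d.strftime('%Y-%m-%d') + '.csv' for d in dates]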
In [2]:
%%time
# 
# load all the data files into a single dataframe and keep one full week of extracts
# downloading and reading these files over the network can take several minutes
# (a local-cache alternative is sketched after this cell)
# 
rawData = pd.concat([pd.read_csv(dataPath+f) for f in fileNames], sort = False)
rawData = rawData[rawData['extract_date_cst'].between('2019-07-29', '2019-08-04')]
CPU times: user 26.6 s, sys: 6.83 s, total: 33.4 s
Wall time: 1min 45s
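
Since every rerun re-downloads all fifteen CSVs, caching them locally speeds up repeat runs considerably. A minimal sketch (the scooter_cache/ directory is hypothetical and not part of the original notebook):

import os
from urllib.request import urlretrieve

cacheDir = 'scooter_cache'                 # hypothetical local cache directory
os.makedirs(cacheDir, exist_ok=True)

for f in fileNames:
    localPath = os.path.join(cacheDir, f)
    if not os.path.exists(localPath):      # download each file only once
        urlretrieve(dataPath + f, localPath)

rawData = pd.concat([pd.read_csv(os.path.join(cacheDir, f)) for f in fileNames], sort=False)
rawData = rawData[rawData['extract_date_cst'].between('2019-07-29', '2019-08-04')]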
In [3]:
# sanity check: each day should have about 96 extracts (one every 15 minutes)
rawData.groupby('extract_date_cst').extract_time_cst.nunique()
Out[3]:
extract_date_cst
2019-07-29    96
2019-07-30    96
2019-07-31    96
2019-08-01    96
2019-08-02    96
2019-08-03    95
2019-08-04    95
Name: extract_time_cst, dtype: int64
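
The expected count is 24 hours x 4 extracts per hour = 96. A short check (a sketch, not part of the original notebook) can flag any day that falls short:

# flag days with fewer than the expected number of 15-minute extracts
extractsPerDay = rawData.groupby('extract_date_cst').extract_time_cst.nunique()
expected = 24 * 60 // 15                           # 96 extracts per day
print(extractsPerDay[extractsPerDay < expected])   # 2019-08-03 and 2019-08-04 each miss one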
In [4]:
%%time
# 
# create fact and dimension tables
# 
# normalize company and device group names to upper case for consistent grouping
rawData['company_name'] = rawData['company_name'].str.upper()
rawData['sumd_group'] = rawData['sumd_group'].str.upper()

# dimension tables: one row per company, one row per device (sumd = shared urban mobility device)
company = rawData[companyColumns].drop_duplicates()
sumd = rawData[sumdColumns].drop_duplicates()
sumd = sumd[sumd['sumd_group'] == 'SCOOTER']

# fact table limited to scooters; copy so later column assignments do not warn
scooterFacts = rawData.loc[rawData['sumd_group'] == 'SCOOTER', factColumns].copy()
CPU times: user 2.93 s, sys: 165 ms, total: 3.1 s
Wall time: 3.1 s
In [5]:
# 
# Create two new columns with the latitude and longitude rounded to 3 decimal places.
# Rounding groups scooters within roughly 350 ft of each other into a single
# location, which keeps the number of unique locations manageable
# (a rough check of the distances involved follows this cell).
# 
scooterFacts['latitude_rnd'] = round(scooterFacts['gps_latitude'], 3)
scooterFacts['longitude_rnd'] = round(scooterFacts['gps_longitude'], 3)

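For context on the "roughly 350 ft" figure: 0.001 degrees of latitude is about 111 m (~364 ft), and 0.001 degrees of longitude at Nashville's latitude is somewhat less. A quick back-of-the-envelope check (a sketch, not part of the original notebook):

import math

# one degree of latitude is roughly 111,000 m everywhere;
# one degree of longitude shrinks with the cosine of the latitude
lat_deg_m = 111_000
lon_deg_m = 111_000 * math.cos(math.radians(36.16))   # Nashville sits near 36.16 N

print(round(0.001 * lat_deg_m * 3.28084))   # ~364 ft north-south per 0.001 degrees
print(round(0.001 * lon_deg_m * 3.28084))   # ~294 ft east-west per 0.001 degrees
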
How many scooters does each company have in Nashville?

In [6]:
companyStats = sumd[['company_name', 'sumd_id']] \
                .groupby('company_name').count() \
                .reset_index() \
                .rename(columns={'company_name': 'Company', 'sumd_id': 'Number Of Scooters'})

companyStats
Out[6]:
Company Number Of Scooters
0 BIRD 1399
1 BOLT 193
2 GOTCHA 227
3 LIME 619
4 LYFT 684
5 SPIN 977
6 UBER 1000

What are the 25 most popular scooters?

The table below lists the 25 scooters that moved between the most locations per day. The values in the 'latitude_rnd' and 'longitude_rnd' columns are the average number of moves per day, computed as the number of distinct rounded locations a scooter appeared at each day minus one, and used here as a proxy for rides.

In [7]:
# 
# number of distinct rounded locations per scooter per day, minus one,
# i.e. the number of times each scooter moved that day (a proxy for rides)
# 
numOfLocsPerDay = scooterFacts[['availability_start_date_cst', 'latitude_rnd', 'longitude_rnd', 'sumd_id']] \
                    .drop_duplicates() \
                    .groupby(['sumd_id', 'availability_start_date_cst']).count() - 1

# average moves per day for each scooter
avgLocsPerDay = numOfLocsPerDay.groupby('sumd_id').mean()

# total moves over the whole week for each scooter
totLocs = numOfLocsPerDay.groupby('sumd_id').sum()

# the 25 scooters with the highest average moves per day, joined to their company
twtyfiveMostMovedScooters = avgLocsPerDay \
                            .sort_values(by='latitude_rnd', ascending = False) \
                            .head(25) \
                            .merge(sumd[['company_name', 'sumd_id']], on='sumd_id')
twtyfiveMostMovedScooters
Out[7]:
sumd_id latitude_rnd longitude_rnd company_name
0 PoweredLE73UC4RVRFYV 33.000000 33.000000 LIME
1 Powered5MTGA3HCRY7OD 23.000000 23.000000 LIME
2 Poweredb857d61d-4091-5c6f-99c5-51e7b759a711 19.500000 19.500000 UBER
3 Powered4fb3a717-8581-59c1-bd7f-8976e5cd1af1 19.000000 19.000000 UBER
4 Powered7da98694-c302-5000-96a0-bec6810a3716 17.000000 17.000000 UBER
5 Powered05e6c0ee-739d-50ef-bc42-5e131feeec80 17.000000 17.000000 UBER
6 PoweredDLDCRRLYUHP5Y 16.500000 16.500000 LIME
7 Poweredc0f24b4c-3ac7-578f-b1ce-84eae7be669c 15.500000 15.500000 UBER
8 Powered8b184f0f-c6af-5cdf-baf2-4640e485522f 15.333333 15.333333 UBER
9 Poweredbb60ef04-c719-597f-805f-20978964fd8c 15.000000 15.000000 UBER
10 Powered1c9168a2-88f4-5f80-aed8-bcc86013ed48 14.250000 14.250000 UBER
11 Poweredb88a05db-7bb8-5864-b4ca-83db99ac0d71 14.000000 14.000000 UBER
12 Powered JAQ5S 14.000000 14.000000 BIRD
13 Powered4ab8e972-1f73-5cc3-a96a-d38ab7afdc80 13.666667 13.666667 UBER
14 Powered80eae78f-0c7e-5a3b-8596-526af77d9e6d 13.500000 13.500000 UBER
15 Powered 6XYNH 13.500000 13.500000 BIRD
16 Powered1d36005a-8b21-549c-8c5a-90ee236b17a2 13.333333 13.333333 UBER
17 Powered9189d266-de92-50a4-bcbc-111c920c9c9f 13.000000 13.000000 UBER
18 Powered384e2ba8-3ae9-5147-b7ca-4d2355d7b85a 13.000000 13.000000 UBER
19 Powered4ee39c27-93c6-5f5a-a896-8e872425db0a 13.000000 13.000000 UBER
20 PoweredNU3TYLT5YIIXW 13.000000 13.000000 LIME
21 Powered85b72f09-9877-5609-a138-0a34015fb3ca 12.750000 12.750000 UBER
22 Powered22916947-8a9c-59a4-a90b-6f9441392e73 12.250000 12.250000 UBER
23 Poweredb21b3e3e-7bf6-555c-8834-26c9189a1c19 12.250000 12.250000 UBER
24 Powered72839f71-6cd6-5292-8540-64e768633430 12.200000 12.200000 UBER
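
The 'latitude_rnd' and 'longitude_rnd' columns above carry identical values because count() returns the number of (non-null) rows for every column. For display, one of them could be kept and renamed, e.g.:

# keep a single count column and give it a descriptive name
twtyfiveMostMovedScooters \
    .rename(columns={'latitude_rnd': 'Avg Moves Per Day'}) \
    [['sumd_id', 'company_name', 'Avg Moves Per Day']]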
In [8]:
# 
# scooters with zero total moves were never recorded at more than one (rounded)
# location on any day; count these per company as 'Scooters Not Ridden'
# 
companyStats = companyStats.merge( \
                                  totLocs[totLocs['latitude_rnd'] == 0] \
                                    .merge(sumd[['company_name', 'sumd_id']], on='sumd_id') \
                                    .groupby('company_name') \
                                    .count() \
                                    .reset_index()[['company_name', 'sumd_id']] \
                                    .rename(columns={'company_name': 'Company', 'sumd_id': 'Scooters Not Ridden'}) \
                                    ,on='Company')
In [9]:
companyStats['Active Scooters'] = companyStats['Number Of Scooters'] - companyStats['Scooters Not Ridden']
In [10]:
companyStats
Out[10]:
Company Number Of Scooters Scooters Not Ridden Active Scooters
0 BIRD 1399 18 1381
1 BOLT 193 45 148
2 GOTCHA 227 207 20
3 LIME 619 24 595
4 LYFT 684 28 656
5 SPIN 977 635 342
6 UBER 1000 614 386

Calculate the total number of rides per company (over the whole week)

In [11]:
# 
# total moves (rides) per company over the week, merged onto companyStats
# and sorted by Total Rides, with a TOTAL row appended at the bottom
# 
companyStats = totLocs \
                .merge(sumd[['company_name', 'sumd_id']], on='sumd_id') \
                .groupby('company_name') \
                .sum() \
                .reset_index()[['company_name', 'latitude_rnd']] \
                .rename(columns={'company_name': 'Company', 'latitude_rnd': 'Total Rides'}) \
                .merge(companyStats, on='Company') \
                .sort_values(by=['Total Rides'], ascending = False)

companyStats = companyStats \
                .append(pd.Series(['TOTAL'], index=['Company']).append(companyStats.sum(numeric_only = True)), \
                        ignore_index = True)

companyStats['Avg Rides Per Active Scooter'] = companyStats['Total Rides'] / companyStats['Active Scooters']
companyStats['% Scooters Ridden'] = companyStats['Active Scooters'] / companyStats['Number Of Scooters']
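
Note that DataFrame.append and Series.append were deprecated in pandas 1.4 and removed in 2.0, so the TOTAL row above will not run on newer versions. A sketch of an equivalent using pd.concat, to be used in place of the .append step (before the two ratio columns are added):

# pandas >= 2.0 replacement for the .append() calls above:
# build the TOTAL row explicitly and concatenate it onto companyStats
totalRow = {'Company': 'TOTAL', **companyStats.sum(numeric_only=True).to_dict()}
companyStats = pd.concat([companyStats, pd.DataFrame([totalRow])], ignore_index=True)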

Format Columns

In [12]:
columnFormats = {'Total Rides': '{:,d}',
                 'Number Of Scooters': '{:,d}',
                 '% Scooters Ridden': '{:.0%}',
                 'Active Scooters': '{:,d}',
                 'Avg Rides Per Active Scooter': '{:.2f}'}
companyStats.style.format(columnFormats)
Out[12]:
Company Total Rides Number Of Scooters Scooters Not Ridden Active Scooters Avg Rides Per Active Scooter % Scooters Ridden
0 BIRD 34,163 1,399 18 1,381 24.74 99%
1 LIME 18,226 619 24 595 30.63 96%
2 UBER 9,173 1,000 614 386 23.76 39%
3 LYFT 4,897 684 28 656 7.46 96%
4 SPIN 4,384 977 635 342 12.82 35%
5 BOLT 449 193 45 148 3.03 77%
6 GOTCHA 40 227 207 20 2.00 9%
7 TOTAL 71,332 5,099 1571 3,528 20.22 69%

Choose columns, add color bars

In [13]:
displayStats = companyStats[[
    'Company',
    'Total Rides',
    'Number Of Scooters',
    '% Scooters Ridden',
    'Avg Rides Per Active Scooter'
]]

displayStats.style.format(columnFormats).bar(color='lightblue')
Out[13]:
Company Total Rides Number Of Scooters % Scooters Ridden Avg Rides Per Active Scooter
0 BIRD 34,163 1,399 99% 24.74
1 LIME 18,226 619 96% 30.63
2 UBER 9,173 1,000 39% 23.76
3 LYFT 4,897 684 96% 7.46
4 SPIN 4,384 977 35% 12.82
5 BOLT 449 193 77% 3.03
6 GOTCHA 40 227 9% 2.00
7 TOTAL 71,332 5,099 69% 20.22