Merge Vehicle and Nonvehicle Dataset (1:1)
# Load the two raw CSV exports from /content, tag every row with its class
# label ('1' for the vehicle table, '0' for the non-vehicle table) and drop
# any row that contains a NaN.
# NOTE(review): keep_default_na=False stops read_csv from converting blank or
# "NA"-style strings into NaN, so dropna may have nothing to remove here --
# confirm this is intended.
novehicle = (
    pd.read_csv(os.path.join("/content", "novehicle.csv"), header=0, keep_default_na=False)
    .assign(label='0')
    .dropna(axis=0, how='any')
)
vehicle = (
    pd.read_csv(os.path.join("/content", "metadata.csv"), header=0, keep_default_na=False)
    .assign(label='1')
    .dropna(axis=0, how='any')
)
...
# Merge two datasets: stack the vehicle rows on top of the non-vehicle rows
# and renumber the index from 0.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and yields the same result.
capstone = pd.concat([vehicle, novehicle], ignore_index=True)
# Export new dataset to google drive and name it 'capstone'
# (google.colab is only importable inside a Colab runtime.)
from google.colab import drive
# Mount Google Drive at the relative path 'drive'.
drive.mount('drive')
# Write the merged frame to a local CSV first.
# NOTE(review): to_csv is called with default arguments, so the row index is
# written as an extra leading column -- confirm that is wanted downstream.
capstone.to_csv('capstone.csv')
# IPython shell escape (not plain Python): copy the CSV into the mounted
# "My Drive" folder.
!cp capstone.csv drive/My\ Drive/
...
# Sanity-check the exported dataset: fetch it, reload it, count rows per label.
# NOTE(review): importfile is defined outside this section; presumably it
# downloads the Drive file with this id into /content -- confirm.
Capstone = importfile(file_id='1-1lVd9sPctCOOrpxJ8vpf7qf6zhU3EhN')
capstone_csv_path = os.path.join("/content", "capstone.csv")
Capstone = pd.read_csv(capstone_csv_path, header=0, keep_default_na=False)
# Number of rows for each class label.
Capstone.groupby('label').size()
Feature Engineering
- Check missing values by plotting a missingness matrix (`msno.matrix`) of `missing_map`
# See missing values: treat the 'unknown_0' / 'unknown_1' entries as NaN and
# draw the missingno matrix of the result.
# (The statements were previously fused into the comment line and never ran.)
missing_map = Capstone.replace(['unknown_0', 'unknown_1'], np.nan)
msno.matrix(missing_map)
- Format localhour from ‘object’ to ‘datetime’ dtype
# Convert the 'localhour' column to datetime dtype, then display it.
# (Two statements were fused on one line, which is a syntax error.)
capstone['localhour'] = pd.to_datetime(capstone['localhour'])
capstone.localhour
- Create timeseries dataset
# Keep only the columns needed for the time-series dataset and preview it.
# Fixes the 'captone' typo (NameError) and splits the two fused statements.
timeseries = capstone[['dataid', 'localhour', 'car1', 'use', 'label']]
timeseries.head()
- import weather dataset and create new feature ssd (by temperature, humidity, wind speed)
- import weather dataset
# Fetch the weather file, load it into a DataFrame and preview the first rows.
# NOTE(review): importfile is defined outside this section; presumably it
# downloads the Drive file with this id into /content -- confirm.
weather = importfile(file_id='1V1qT6o65vB0XsSFs2a3GL1u6QCJTn0R9')
weather_csv_path = os.path.join("/content", "weather.csv")
weather = pd.read_csv(weather_csv_path, header=0, keep_default_na=False)
weather.head()
- Use geopandas to create a GeoDataFrame with a `Coordinates` geometry column
# Pair each row's longitude/latitude into an (x, y) tuple.
weather['Coordinates'] = list(zip(weather.longitude, weather.latitude))
# Drop the rows located at this one coordinate pair. The .copy() makes the
# filtered frame independent of the original, so the column assignment below
# is not a chained assignment on a slice (SettingWithCopyWarning).
weather = weather[weather['Coordinates'] != (-117.15188500000001, 32.778033)].copy()
# Turn the tuples into Point geometries and use them as the geometry column.
weather['Coordinates'] = weather['Coordinates'].apply(Point)
gdf = geopandas.GeoDataFrame(weather, geometry='Coordinates')
# NOTE(review): geopandas.datasets was deprecated in geopandas 0.14 and
# removed in 1.0 -- confirm against the installed version.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# We restrict to North America.
ax = world[world.continent == 'North America'].plot(
    color='white', edgecolor='black')
# We can now plot our GeoDataFrame.
gdf.plot(ax=ax, color='red')
plt.show()
- create a new feature ssd (human comfort)
def ssd(row):
    """Return the SSD (human comfort) index for one weather observation.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            numeric 'temperature', 'humidity' and 'wind_speed' entries.

    Returns:
        The index value as a float.
    """
    temperature = row['temperature']
    humidity = row['humidity']
    wind_speed = row['wind_speed']
    # NOTE: the second term divides by (45 - temperature), so the index is
    # undefined at temperature == 45.
    return float(
        (1.818 * temperature + 18.18) * (0.88 + 0.002 * humidity)
        + (temperature - 32) / (45 - temperature)
        - 3.2 * wind_speed
        + 18.2
    )
# Evaluate ssd on every row (axis=1) and store the result as a new 'ssd'
# column, then preview the first five rows.
weather['ssd'] = weather.apply(ssd, axis=1)
weather.head(5)
- left join capstone dataset and weather dataset on ‘localhour’ and ‘state’
# Attach the comfort index to every row of `final`: left join on the
# (localhour, state) pair so rows without a weather match are kept.
weather_ssd = weather_new[['localhour', 'state', 'ssd']]
final_ssd = pd.merge(
    left=final,
    right=weather_ssd,
    how='left',
    on=['localhour', 'state'],
)
# Force 'ssd' to a numeric dtype; values that cannot be parsed become NaN.
ssd_numeric = pd.to_numeric(final_ssd.ssd, errors="coerce")
final_ssd.ssd = ssd_numeric
Finally, I obtained timeseries.csv and final_ssd.csv, which are used later to build the time-series and baseline models.