-
Notifications
You must be signed in to change notification settings - Fork 7
I one-hot encoded the country and metro columns (in the preprocessing function), and I preprocessed the longitude and latitude columns. #101
base: master
Are you sure you want to change the base?
Changes from 3 commits
2ced654
f04aa04
a352a00
6f49dae
4b83074
734cb76
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ | |
| import json | ||
| import argparse | ||
| from sklearn import preprocessing | ||
| from sklearn import impute | ||
|
|
||
| _DATA_DIR = './processed_data' | ||
| _TRAIN = 'trainminusval_visits.csv' | ||
|
|
@@ -108,17 +109,85 @@ def preprocess(self, do_val_split=True): | |
| for df, df_labels in dfs: | ||
| df_out = pd.DataFrame({'visitorId': df['fullVisitorId'].unique()}) | ||
| df_out.set_index('visitorId', inplace=True) | ||
|
|
||
| # Preprocessing operations go here. | ||
| df_out['log_sum_revenue'] = self._make_log_sum_revenue(df) | ||
| df_out['encoding_medium'], df_out['encoding_referralPath'], df_out['encoding_source'] = self._make_traffic_source_preprocessing(df) | ||
| df_out['encoding_campaign'], df_out['encoding_isTrueDirect'], df_out['encoding_keyword'] = self._another_traffic_source_preprocessing(df) | ||
| df_out = df_out.join(self._make_browser_preprocessing()) | ||
| df_out = df_out.join(self._preprocess_deviceCategory()) | ||
| df_out = df_out.join(self._make_browser_preprocessing(df)) | ||
| df_out = df_out.join(self._preprocess_deviceCategory(df)) | ||
| df_out['geoNetwork.first_x_longitude'], df_out['geoNetwork.last_x_longitude'], df_out['geoNetwork.first_y_longitude'], df_out['geoNetwork.last_y_longitude'] = self._preprocess_longitudes_and_latitudes(df) | ||
| df_out = df_out.join(self._preprocess_country(df)) | ||
| df_out = df_out.join(self._preprocess_metro(df)) | ||
| dfs_out.append((df_out, df_labels)) | ||
|
|
||
| return dfs_out | ||
|
|
||
| def _preprocess_country(self, df): | ||
| # One-hot encode the countries. | ||
| new_country_col = pd.Series(df['geoNetwork.country']) | ||
| country_dummies_df = pd.get_dummies(new_country_col) | ||
|
|
||
| return country_dummies_df | ||
|
|
||
| def _preprocess_metro(self, df): | ||
| # One-hot encode the metropolitan areas. | ||
| new_metro_col = pd.Series(df['geoNetwork.metro']) | ||
| metro_dummies_df = pd.get_dummies(new_metro_col) | ||
|
|
||
| return metro_dummies_df | ||
|
|
||
| def _preprocess_longitudes_and_latitudes(self, df): | ||
| """Preprocesses the columns u'geoNetwork.latitude', | ||
| u'geoNetwork.longitude', and u'geoNetwork.metro' in the dataset train.csv. | ||
| Creates columns of the first and last x and y longitudes and latitudes (4 columns) of each visitor. | ||
| Also creates dummy columns of the country and metropolitan area of each visit. | ||
|
|
||
| Returns: | ||
| The training dataframe (indexed by visitor) with the added longitude/latitude columns. | ||
| """ | ||
|
|
||
| train_df = df.copy(deep=False) | ||
|
|
||
| # Preprocess the numeric columns. Group by visitor and standardize, impute missing values, and normalize | ||
|
|
||
| # Impute the missing values in latitudes and longitudes. | ||
| imp = impute.SimpleImputer(missing_values='not available in demo dataset', strategy='mean') | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not able to import sklearn.impute; maybe it's my version. Regardless, would it be possible to do this with pandas to avoid adding more dependencies?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, is imputation the right thing to do here? You will need to impute some value, but I think it would make more sense to impute after doing your transformation to x, y coordinates. You should also consider adding a binary "missing" column so we can keep track of which values were imputed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One question about imputing -- is it OK to replace the unknown x and y values with 0? I ask because a visitor could still be at a specific location on Earth with all values equal to 0.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think imputing with zero is fine, but you should include the extra column to mark which values have been imputed. Otherwise, you will have no way of telling apart the values that were imputed from the ones that are actually zero.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you -- I understand. |
||
| train_df['imputed_latitude'] = imp.transform(train_df['geoNetwork.latitude']) | ||
| train_df['imputed_longitude'] = imp.transform(train_df['geoNetwork.longitude']) | ||
|
|
||
| # Convert longitude and latitude into x and y coordinates. | ||
| # First convert from degrees to radians, then take sin and cosine. | ||
| train_df['x_longitude'] = train_df['geoNetwork.longitude'] * (math.pi / 180) | ||
| train_df['x_longitude'] = numpy.cos(train_df['x_longitude']) | ||
| train_df['y_longitude'] = train_df['geoNetwork.longitude'] * (math.pi / 180) | ||
| train_df['y_longitude'] = numpy.sin(train_df['y_longitude']) | ||
|
|
||
| train_df['x_latitude'] = train_df['geoNetwork.latitude'] * (math.pi / 180) | ||
| train_df['x_latitude'] = numpy.cos(train_df['x_latitude']) | ||
| train_df['y_latitude'] = train_df['geoNetwork.latitude'] * (math.pi / 180) | ||
| train_df['y_latitude'] = numpy.sin(train_df['y_latitude']) | ||
|
|
||
| # Sort by date. | ||
| train_df = train_df.sort_values(by = ['date']) | ||
|
jonathancstroud marked this conversation as resolved.
|
||
|
|
||
| # Group by visitor ID. | ||
| train_gdf = train_df.groupby('fullVisitorId') | ||
|
|
||
| # First, use the train_df data to convert long & lat from degrees to radians, then take sin and cosine. | ||
| # Then, group by fullVisitorID, and for each Visitor as a row, create a column with the Visitor's first | ||
| # and last x and y longitudes and latitudes. | ||
|
|
||
| # The first Longitudes of each visitor are in train_gdf['x_longitude'].first() | ||
| # The last Longitudes of each visitor are in train_gdf['x_longitude'].last() | ||
| # The first Latitudes of each visitor are in train_gdf['x_latitude'].first() | ||
| # The last Latitudes of each visitor are in train_gdf['x_latitude'].last() | ||
| df['geoNetwork.first_x_longitude'] = train_gdf['x_longitude'].first() | ||
| df['geoNetwork.last_x_longitude'] = train_gdf['x_longitude'].last() | ||
| df['geoNetwork.first_y_longitude'] = train_gdf['y_latitude'].first() | ||
| df['geoNetwork.last_y_longitude'] = train_gdf['y_latitude'].last() | ||
|
|
||
| return train_gdf['x_longitude'].first(), train_gdf['x_longitude'].last(), train_gdf['y_latitude'].first(), train_gdf['y_latitude'].last() | ||
|
|
||
| def _make_log_sum_revenue(self, df): | ||
| """Create the log_sum_revenue column. | ||
|
|
||
|
|
@@ -185,15 +254,15 @@ def _another_traffic_source_preprocessing(self, df): | |
| train_gdf = train_df.groupby('fullVisitorId') | ||
| return train_gdf['encoding_campaign'].sum(), train_gdf['encoding_isTrueDirect'].sum(), train_gdf['encoding_keyword'].sum() | ||
|
|
||
| def _make_browser_preprocessing(self): | ||
| def _make_browser_preprocessing(self, df): | ||
| """Creates the encoding columns of device.browser, device.browserSize, device.browserVersion | ||
|
|
||
| Returns: | ||
| A Dataframe containing one hot encoded columns for unique values of device.browser, | ||
| device.browserSize, device.browserVersion | ||
|
|
||
| """ | ||
| train_df = self.train.copy(deep=False) | ||
| train_df = df.copy(deep=False) | ||
| browser = self._one_hot('device.browser') | ||
| browserSize = self._one_hot('device.browserSize') | ||
| browserVersion = self._one_hot('device.browserVersion') | ||
|
|
@@ -222,7 +291,7 @@ def _make_json_converter(self, column_name): | |
| """Helper function to interpret columns in PANDAS.""" | ||
| return lambda x: {column_name: json.loads(x)} | ||
|
|
||
| def _preprocess_deviceCategory(self): | ||
| def _preprocess_deviceCategory(self, df): | ||
| """ Creates one hot encoding columns for the device.deviceCategory | ||
| args: | ||
| self: the google analytics Dataset | ||
|
|
@@ -233,7 +302,7 @@ def _preprocess_deviceCategory(self): | |
| """ | ||
|
|
||
| # Obtain list of device categories from training set | ||
| train_df = self.train.copy(deep = False).set_index('fullVisitorId') | ||
| train_df = df.copy(deep = False).set_index('fullVisitorId') | ||
| deviceCategory = train_df['device.deviceCategory'].fillna('missing') | ||
|
|
||
| # Create one hot encoding | ||
|
|
@@ -261,5 +330,5 @@ def _preprocess_deviceCategory(self): | |
| else: | ||
| assert num_train == _NUM_ROWS_TRAIN, 'Incorrect number of training examples found.' | ||
| assert num_test == _NUM_ROWS_TEST, 'Incorrect number of test examples found.' | ||
|
|
||
| print('Successfully loaded the dataset.') | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please clarify the meaning of "x and y longitudes and latitudes".