Skip to content
This repository was archived by the owner on Jul 15, 2023. It is now read-only.
Open
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 77 additions & 8 deletions dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import argparse
from sklearn import preprocessing
from sklearn import impute

_DATA_DIR = './processed_data'
_TRAIN = 'trainminusval_visits.csv'
Expand Down Expand Up @@ -108,17 +109,85 @@ def preprocess(self, do_val_split=True):
for df, df_labels in dfs:
df_out = pd.DataFrame({'visitorId': df['fullVisitorId'].unique()})
df_out.set_index('visitorId', inplace=True)

# Preprocessing operations go here.
df_out['log_sum_revenue'] = self._make_log_sum_revenue(df)
df_out['encoding_medium'], df_out['encoding_referralPath'], df_out['encoding_source'] = self._make_traffic_source_preprocessing(df)
df_out['encoding_campaign'], df_out['encoding_isTrueDirect'], df_out['encoding_keyword'] = self._another_traffic_source_preprocessing(df)
df_out = df_out.join(self._make_browser_preprocessing())
df_out = df_out.join(self._preprocess_deviceCategory())
df_out = df_out.join(self._make_browser_preprocessing(df))
df_out = df_out.join(self._preprocess_deviceCategory(df))
df_out['geoNetwork.first_x_longitude'], df_out['geoNetwork.last_x_longitude'], df_out['geoNetwork.first_y_longitude'], df_out['geoNetwork.last_y_longitude'] = self._preprocess_longitudes_and_latitudes(df)
df_out = df_out.join(self._preprocess_country(df))
df_out = df_out.join(self._preprocess_metro(df))
dfs_out.append((df_out, df_labels))

return dfs_out

def _preprocess_country(self, df):
# One-hot encode the countries.
new_country_col = pd.Series(df['geoNetwork.country'])
country_dummies_df = pd.get_dummies(new_country_col)

return country_dummies_df

def _preprocess_metro(self, df):
# One-hot encode the metropolitan areas.
new_metro_col = pd.Series(df['geoNetwork.metro'])
metro_dummies_df = pd.get_dummies(new_metro_col)

return metro_dummies_df

def _preprocess_longitudes_and_latitudes(self, df):
"""Preprocesses the columns u'geoNetwork.latitude',
u'geoNetwork.longitude', and u'geoNetwork.metro' in the dataset train.csv.
Creates columns of the first and last x and y longitudes and latitudes (4 columns) of each visitor.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please clarify the meaning of "x and y longitudes and latitudes"

Also creates dummy columns of the country and metropolitan area of each visit.

Returns:
The training dataframe (indexed by visitor) with the added longitude/latitude columns.
"""

train_df = df.copy(deep=False)

# Preprocess the numeric columns. Group by visitor and standardize, impute missing values, and normalize

# Impute the missing values in latitudes and longitudes.
imp = impute.SimpleImputer(missing_values='not available in demo dataset', strategy='mean')

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not able to import sklearn.impute, maybe it's my version. But regardless, would it be possible to do this with pandas to avoid more dependencies?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, is imputation the right thing to do here? You will need to impute some value, but I think it would make more sense to impute after doing your transformation to x,y coordinates. Also, you should also consider adding a binary "missing" column so we can keep track of which values were imputed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One question about imputing -- is it ok to replace the unknown x and y values with 0? Because the visitor could still be at a specific location on Earth with all values equal to 0.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think imputing with zero is fine, but you should include the extra column to mark which values have been imputed. Otherwise, you will have no way of telling apart the ones that are imputed and the ones that are actually zero.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you -- I understand.

train_df['imputed_latitude'] = imp.transform(train_df['geoNetwork.latitude'])
train_df['imputed_longitude'] = imp.transform(train_df['geoNetwork.longitude'])

# Convert longitude and latitude into x and y coordinates.
# First convert from degrees to radians, then take sin and cosine.
train_df['x_longitude'] = train_df['geoNetwork.longitude'] * (math.pi / 180)
train_df['x_longitude'] = numpy.cos(train_df['x_longitude'])
train_df['y_longitude'] = train_df['geoNetwork.longitude'] * (math.pi / 180)
train_df['y_longitude'] = numpy.sin(train_df['y_longitude'])

train_df['x_latitude'] = train_df['geoNetwork.latitude'] * (math.pi / 180)
train_df['x_latitude'] = numpy.cos(train_df['x_latitude'])
train_df['y_latitude'] = train_df['geoNetwork.latitude'] * (math.pi / 180)
train_df['y_latitude'] = numpy.sin(train_df['y_latitude'])

# Sort by date.
train_df = train_df.sort_values(by = ['date'])
Comment thread
jonathancstroud marked this conversation as resolved.

# Group by Visitor ID.
train_gdf = train_df.groupby('fullVisitorId')

# First, use the train_df data to convert long & lat from degrees to radians, then take sin and cosine.
# Then, group by fullVisitorID, and for each Visitor as a row, create a column with the Visitor's first
# and last x and y longitudes and latitudes.

# The first Longitudes of each visitor are in train_gdf['x_longitude'].first()
# The last Longitudes of each visitor are in train_gdf['x_longitude'].last()
# The first Latitudes of each visitor are in train_gdf['x_latitude'].first()
# The last Latitudes of each visitor are in train_gdf['x_latitude'].last()
df['geoNetwork.first_x_longitude'] = train_gdf['x_longitude'].first()
df['geoNetwork.last_x_longitude'] = train_gdf['x_longitude'].last()
df['geoNetwork.first_y_longitude'] = train_gdf['y_latitude'].first()
df['geoNetwork.last_y_longitude'] = train_gdf['y_latitude'].last()

return train_gdf['x_longitude'].first(), train_gdf['x_longitude'].last(), train_gdf['y_latitude'].first(), train_gdf['y_latitude'].last()

def _make_log_sum_revenue(self, df):
"""Create the log_sum_revenue column.

Expand Down Expand Up @@ -185,15 +254,15 @@ def _another_traffic_source_preprocessing(self, df):
train_gdf = train_df.groupby('fullVisitorId')
return train_gdf['encoding_campaign'].sum(), train_gdf['encoding_isTrueDirect'].sum(), train_gdf['encoding_keyword'].sum()

def _make_browser_preprocessing(self):
def _make_browser_preprocessing(self, df):
"""Creates the encoding columns of device.browser, device.browserSize, device.browserVersion

Returns:
A Dataframe containing one hot encoded columns for unique values of device.browser,
device.browserSize, device.browserVersion

"""
train_df = self.train.copy(deep=False)
train_df = df.copy(deep=False)
browser = self._one_hot('device.browser')
browserSize = self._one_hot('device.browserSize')
browserVersion = self._one_hot('device.browserVersion')
Expand Down Expand Up @@ -222,7 +291,7 @@ def _make_json_converter(self, column_name):
"""Helper function to interpret columns in PANDAS."""
return lambda x: {column_name: json.loads(x)}

def _preprocess_deviceCategory(self):
def _preprocess_deviceCategory(self, df):
""" Creates one hot encoding columns for the device.deviceCategory
args:
self: the google analytics Dataset
Expand All @@ -233,7 +302,7 @@ def _preprocess_deviceCategory(self):
"""

# Obtain list of device categories from training set
train_df = self.train.copy(deep = False).set_index('fullVisitorId')
train_df = df.copy(deep = False).set_index('fullVisitorId')
deviceCategory = train_df['device.deviceCategory'].fillna('missing')

# Create one hot encoding
Expand Down Expand Up @@ -261,5 +330,5 @@ def _preprocess_deviceCategory(self):
else:
assert num_train == _NUM_ROWS_TRAIN, 'Incorrect number of training examples found.'
assert num_test == _NUM_ROWS_TEST, 'Incorrect number of test examples found.'

print('Successfully loaded the dataset.')