MichiganDataScienceTeam · kahanaa · Nov 22, 2018 · Nov 26, 2018 · Nov 26, 2018 · Nov 29, 2018
@@ -6,6 +6,7 @@
 import json
 import argparse
 from sklearn import preprocessing
+from sklearn import impute
 
 _DATA_DIR = './processed_data'
 _TRAIN = 'trainminusval_visits.csv'
@@ -108,17 +109,85 @@ def preprocess(self, do_val_split=True):
         for df, df_labels in dfs:
             df_out = pd.DataFrame({'visitorId': df['fullVisitorId'].unique()})
             df_out.set_index('visitorId', inplace=True)
-
             # Preprocessing operations go here.
             df_out['log_sum_revenue'] = self._make_log_sum_revenue(df)
             df_out['encoding_medium'], df_out['encoding_referralPath'], df_out['encoding_source'] = self._make_traffic_source_preprocessing(df)
             df_out['encoding_campaign'], df_out['encoding_isTrueDirect'], df_out['encoding_keyword'] = self._another_traffic_source_preprocessing(df)
-            df_out = df_out.join(self._make_browser_preprocessing())
-            df_out = df_out.join(self._preprocess_deviceCategory())
+            df_out = df_out.join(self._make_browser_preprocessing(df))
+            df_out = df_out.join(self._preprocess_deviceCategory(df))
+            df_out['geoNetwork.first_x_longitude'], df_out['geoNetwork.last_x_longitude'], df_out['geoNetwork.first_y_longitude'], df_out['geoNetwork.last_y_longitude'] = self._preprocess_longitudes_and_latitudes(df)
+            df_out = df_out.join(self._preprocess_country(df))
+            df_out = df_out.join(self._preprocess_metro(df))
             dfs_out.append((df_out, df_labels))
 
         return dfs_out
 
+    def _preprocess_country(self, df):
+        """One-hot encode ``geoNetwork.country`` and collapse it to one row per visitor.
+
+        Args:
+            df: Visit-level dataframe containing the 'fullVisitorId' and
+                'geoNetwork.country' columns.
+
+        Returns:
+            A dataframe indexed by fullVisitorId with one indicator column per
+            country value; an indicator is set when any of that visitor's
+            visits has the country.
+        """
+        country_dummies_df = pd.get_dummies(df['geoNetwork.country'])
+
+        # The caller joins this result onto a frame indexed by visitor id, so the
+        # visit-level rows must be reduced to one row per visitor; returning the
+        # raw dummies would join on the visit row index instead.
+        # NOTE(review): columns are named after the raw country values; if another
+        # joined frame produces the same labels, DataFrame.join raises on the
+        # overlap -- confirm whether a column prefix is wanted.
+        return country_dummies_df.groupby(df['fullVisitorId']).max()
+
+    def _preprocess_metro(self, df):
+        """One-hot encode ``geoNetwork.metro`` and collapse it to one row per visitor.
+
+        Args:
+            df: Visit-level dataframe containing the 'fullVisitorId' and
+                'geoNetwork.metro' columns.
+
+        Returns:
+            A dataframe indexed by fullVisitorId with one indicator column per
+            metropolitan-area value; an indicator is set when any of that
+            visitor's visits has the value.
+        """
+        metro_dummies_df = pd.get_dummies(df['geoNetwork.metro'])
+
+        # The caller joins this result onto a frame indexed by visitor id, so the
+        # visit-level rows must be reduced to one row per visitor; returning the
+        # raw dummies would join on the visit row index instead.
+        # NOTE(review): columns are named after the raw metro values; if another
+        # joined frame produces the same labels, DataFrame.join raises on the
+        # overlap -- confirm whether a column prefix is wanted.
+        return metro_dummies_df.groupby(df['fullVisitorId']).max()
+
+    def _preprocess_longitudes_and_latitudes(self, df):
+        """Build per-visitor cyclic longitude features from 'geoNetwork.longitude'.
+
+        The longitude column is parsed as numbers (non-numeric entries such as
+        the 'not available in demo dataset' placeholder become missing), missing
+        values are replaced with the column mean, and each value is mapped to
+        the cosine (x) and sine (y) of its angle in radians. Visits are ordered
+        by 'date' and, for every visitor, the first and last x and y values
+        are taken.
+
+        Only longitude-derived features are returned, matching the four
+        columns the caller assigns. The input dataframe is not modified.
+
+        Args:
+            df: Visit-level dataframe containing the 'fullVisitorId', 'date'
+                and 'geoNetwork.longitude' columns.
+
+        Returns:
+            A 4-tuple of Series indexed by fullVisitorId:
+            (first x_longitude, last x_longitude,
+             first y_longitude, last y_longitude).
+            If no longitude in the column is numeric, the mean is undefined
+            and every returned value is NaN.
+        """
+        # Imported locally so the method does not depend on a module-level
+        # numpy import being present.
+        import numpy as np
+
+        # Work on a narrow private frame so the caller's dataframe is untouched.
+        visits = df[['fullVisitorId', 'date']].copy()
+
+        # Coerce placeholders / unparsable strings to NaN, then mean-impute.
+        longitude = pd.to_numeric(df['geoNetwork.longitude'], errors='coerce')
+        longitude = longitude.fillna(longitude.mean())
+
+        # Degrees -> radians, then project onto the unit circle.
+        longitude_rad = np.deg2rad(longitude)
+        visits['x_longitude'] = np.cos(longitude_rad)
+        visits['y_longitude'] = np.sin(longitude_rad)
+
+        # A stable sort keeps the original row order among visits sharing a
+        # date, so "first" and "last" are deterministic.
+        visits = visits.sort_values(by=['date'], kind='mergesort')
+
+        # Group by visitor and pick the chronologically first / last values.
+        visits_by_visitor = visits.groupby('fullVisitorId')
+        x_longitude = visits_by_visitor['x_longitude']
+        y_longitude = visits_by_visitor['y_longitude']
+
+        return x_longitude.first(), x_longitude.last(), y_longitude.first(), y_longitude.last()
+
     def _make_log_sum_revenue(self, df):
         """Create the log_sum_revenue column.
 
@@ -185,15 +254,15 @@ def _another_traffic_source_preprocessing(self, df):
         train_gdf = train_df.groupby('fullVisitorId')
         return train_gdf['encoding_campaign'].sum(), train_gdf['encoding_isTrueDirect'].sum(), train_gdf['encoding_keyword'].sum()
 
-    def _make_browser_preprocessing(self):
+    def _make_browser_preprocessing(self, df):
         """Creates the encoding columns of device.browser, device.browserSize, device.browserVersion
 
         Returns:
             A Dataframe containing one hot encoded columns for unique values of device.browser,
             device.browserSize, device.browserVersion
 
         """
-        train_df = self.train.copy(deep=False)
+        train_df = df.copy(deep=False)
         browser = self._one_hot('device.browser')
         browserSize = self._one_hot('device.browserSize')
         browserVersion = self._one_hot('device.browserVersion')
@@ -222,7 +291,7 @@ def _make_json_converter(self, column_name):
         """Helper function to interpret columns in PANDAS."""
         return lambda x: {column_name: json.loads(x)}
 
-    def _preprocess_deviceCategory(self):
+    def _preprocess_deviceCategory(self, df):
         """ Creates one hot encoding columns for the device.deviceCategory
         args:
             self: the google analytics Dataset
@@ -233,7 +302,7 @@ def _preprocess_deviceCategory(self):
         """
 
         # Obtain list of device categories from training set
-        train_df = self.train.copy(deep = False).set_index('fullVisitorId')
+        train_df = df.copy(deep = False).set_index('fullVisitorId')
         deviceCategory = train_df['device.deviceCategory'].fillna('missing')
 
         # Create one hot encoding
@@ -261,5 +330,5 @@ def _preprocess_deviceCategory(self):
     else:
         assert num_train == _NUM_ROWS_TRAIN, 'Incorrect number of training examples found.'
         assert num_test == _NUM_ROWS_TEST, 'Incorrect number of test examples found.'
-
+    
     print('Successfully loaded the dataset.')