From ce8902d9004bb1bcf891bd19405d6602b548b44e Mon Sep 17 00:00:00 2001
From: Oriana <orianac@uw.edu>
Date: Tue, 8 Feb 2022 19:54:44 +0000
Subject: [PATCH 1/2] changes for model training and inference

---
 carbonplan_trace/v1/load.py          |   4 +-
 notebooks/processing/inference.ipynb | 221 +++++++++++++++++---
 notebooks/processing/model.ipynb     | 297 +++++++++------------------
 3 files changed, 288 insertions(+), 234 deletions(-)

diff --git a/carbonplan_trace/v1/load.py b/carbonplan_trace/v1/load.py
index 48415f7..1bcc3bc 100644
--- a/carbonplan_trace/v1/load.py
+++ b/carbonplan_trace/v1/load.py
@@ -229,13 +229,13 @@ def biomass(tiles, year):
 
 
 def training(realm, y0=2003, y1=2010, reload=False, access_key_id=None, secret_access_key=None):
-    output_filename = f's3://carbonplan-climatetrace/v1/training/{realm}/all_data.parquet'
+    output_filename = f's3://carbonplan-climatetrace/v2/training/{realm}/all_data.parquet'
     if fs.exists(output_filename) and not reload:
         return pd.read_parquet(output_filename)
     else:
         output = []
         for yr in range(y0, y1):
-            folder_name = f's3://carbonplan-climatetrace/v1/training/{realm}/{yr}/'
+            folder_name = f's3://carbonplan-climatetrace/v2/training/{realm}/{yr}/'
             files = fs.ls(folder_name)
             for f in files:
                 output.append(pd.read_parquet(f's3://{f}'))
diff --git a/notebooks/processing/inference.ipynb b/notebooks/processing/inference.ipynb
index 0f103fe..9db759b 100644
--- a/notebooks/processing/inference.ipynb
+++ b/notebooks/processing/inference.ipynb
@@ -50,6 +50,17 @@
     "from carbonplan_trace.v1 import utils\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyproj\n",
+    "\n",
+    "pyproj.__version__"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -88,7 +99,8 @@
     "    # spin up local cluster. must be on big enough machine\n",
     "    from dask.distributed import Client\n",
     "\n",
-    "    client = Client(n_workers=2, threads_per_worker=15, resources={\"workertoken\": 1})\n",
+    "    # for very large jobs use 8 workers with 8 threads each\n",
+    "    client = Client(n_workers=8, threads_per_worker=8, resources={\"workertoken\": 1})\n",
     "    client\n",
     "else:\n",
     "    gateway = Gateway()\n",
@@ -107,15 +119,6 @@
     "#     cluster.scale(100)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cluster"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -145,6 +148,17 @@
     "        cluster.shutdown()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "shutdown_cluster(\"local\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -171,6 +185,20 @@
     "tiles and write it out to a mapper with those specifications.\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ul_lats = [\"10S\", \"20S\", \"30S\"]\n",
+    "ul_lons = [f\"{lon}E\" for lon in np.arange(110, 151, 10)]\n",
+    "lat_lon_tags = []\n",
+    "for ul_lat in ul_lats:\n",
+    "    for ul_lon in ul_lons:\n",
+    "        lat_lon_tags.append((ul_lat, ul_lon))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -182,11 +210,12 @@
     "    \"palladium/production/s3fs-public/atoms/files/\"\n",
     "    \"WRS2_descending_0.zip\"\n",
     ")\n",
-    "bucket = \"s3://carbonplan-climatetrace/v1\"\n",
+    "bucket = \"s3://carbonplan-climatetrace/v2.1\"\n",
     "\n",
-    "biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n",
-    "biomass_files = fs.ls(biomass_folder)\n",
-    "lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n",
+    "# biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n",
+    "# biomass_files = fs.ls(biomass_folder) # just to get list of lat_lon tiles we want\n",
+    "# lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n",
+    "# lat_lon_tags = [('60N', '130W')]#, ('40N', '130W')]#, ('00N', '060W')] #('50N', '130W'),\n",
     "bounding_boxes = [utils.parse_bounding_box_from_lat_lon_tags(lat, lon) for lat, lon in lat_lon_tags]"
    ]
   },
@@ -199,10 +228,11 @@
     "from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS\n",
     "\n",
     "processed_scenes = []\n",
-    "for year in np.arange(2014, 2021):\n",
-    "    processed_scenes.extend(fs.ls(f\"{bucket}/inference/rf/{year}\", recursive=True))\n",
+    "for year in np.arange(2011, 2022):\n",
+    "    processed_scenes.extend(fs.ls(f\"{bucket}/inference/xg/{year}\", recursive=True))\n",
     "\n",
-    "processed_scenes = [scene[-19:-8] for scene in processed_scenes]"
+    "processed_scenes = [scene[-19:-8] for scene in processed_scenes]\n",
+    "len(processed_scenes)"
    ]
   },
   {
@@ -211,7 +241,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "len(processed_scenes)"
+    "import carbonplan_trace"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll loop through every scene and every year and calculate biomass for that scene. This will produce\n",
+    "a table of values [x, y (both specific to the UTM projection), lat, lon, biomass].\n"
    ]
   },
   {
@@ -220,7 +258,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "len(processed_scenes) - 57875"
+    "for bounding_box in bounding_boxes:\n",
+    "    min_lat, max_lat, min_lon, max_lon = bounding_box\n",
+    "    valid_scenes = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values"
    ]
   },
   {
@@ -229,15 +269,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "len(bounding_boxes)"
+    "file_lengths = pd.DataFrame(\n",
+    "    columns=[\"v1-rf\", \"v2-rf\", \"v2-xg\"],\n",
+    "    index=[\"_\".join([str(path), str(row)]) for (path, row) in valid_scenes],\n",
+    ")"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# rerun_scenes = {'2010':[], '2014':[]}\n",
+    "# setups = [('v2', 'rf')]#, ('v2', 'xg')] #('v1', 'rf'),\n",
+    "# for year in ['2010', '2014']:\n",
+    "#     for (version, model) in setups:\n",
+    "#         for [path, row] in valid_scenes:\n",
+    "#             output_name = f\"{year}/{path:03d}{row:03d}.parquet\"\n",
+    "#             print(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')\n",
+    "#             if len(fs.ls(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')) == 0:\n",
+    "#                 if [path, row] not in rerun_scenes[year]:\n",
+    "#                     rerun_scenes[year].append([path, row])\n",
+    "#         i+=1\n",
+    "#             file_length = len(pd.read_parquet(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}'))\n",
+    "#         except FileNotFoundError:\n",
+    "#             file_length = np.nan\n",
+    "\n",
+    "#         file_lengths.loc[f'{path}_{row}', f'{version}-{model}'] = file_length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "We'll loop through every scene and every year and calculate biomass for that scene. Will produce\n",
-    "table of values [x, y, (both specific to utm projection), lat, lon, biomass].\n"
+    "# file_lengths.to_csv('files_to_repeat.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remove each entry in index"
    ]
   },
   {
@@ -249,15 +328,14 @@
    "outputs": [],
    "source": [
     "landsat_bucket = \"s3://usgs-landsat/collection02/level-2/standard/etm/{}/{:03d}/{:03d}/\"\n",
-    "\n",
     "with rio.Env(aws_session):\n",
-    "    #     tasks = []\n",
+    "    tasks = []\n",
     "    task_ids = []\n",
     "    for bounding_box in bounding_boxes:\n",
     "        print(bounding_box)\n",
     "        min_lat, max_lat, min_lon, max_lon = bounding_box\n",
     "        scenes_in_tile = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values\n",
-    "        for year in np.arange(2014, 2021):\n",
+    "        for year in np.arange(2011, 2022):\n",
     "            for [path, row] in scenes_in_tile:\n",
     "                scene_stores = fs.ls(landsat_bucket.format(year, path, row))\n",
     "                output_name = f\"{year}/{path:03d}{row:03d}\"\n",
@@ -265,9 +343,11 @@
     "                    continue\n",
     "                elif output_name in processed_scenes:\n",
     "                    continue\n",
+    "                elif output_name in task_ids:  # already queued in this run\n",
+    "                    continue\n",
     "                else:\n",
     "                    tasks.append(\n",
-    "                        # predict(\n",
+    "                        #                         predict(\n",
     "                        client.compute(\n",
     "                            predict_delayed(\n",
     "                                model_folder=f\"{bucket}/models/\",\n",
@@ -281,7 +361,7 @@
     "                            resources={\"workertoken\": 1},\n",
     "                        )\n",
     "                    )\n",
-    "                    task_ids.append([path, row, year, max_lat, min_lon])"
+    "                    task_ids.append(output_name)"
    ]
   },
   {
@@ -292,7 +372,7 @@
    },
    "outputs": [],
    "source": [
-    "len(tasks)"
+    "len(rerun_scenes[\"2014\"])"
    ]
   },
   {
@@ -307,6 +387,15 @@
     "results"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -320,8 +409,8 @@
     "# row = task_id[i][1]\n",
     "# year = task_id[i][2]\n",
     "\n",
-    "path = 93\n",
-    "row = 11\n",
+    "path = 48\n",
+    "row = 22\n",
     "year = 2014\n",
     "\n",
     "print(path, row, year)\n",
@@ -337,6 +426,72 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/xg/2014/054018.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# i = 0\n",
+    "# path = task_id[i][0]\n",
+    "# row = task_id[i][1]\n",
+    "# year = task_id[i][2]\n",
+    "\n",
+    "path = 54\n",
+    "row = 18\n",
+    "year = 2010\n",
+    "\n",
+    "print(path, row, year)\n",
+    "\n",
+    "predict(\n",
+    "    model_folder=f\"{bucket}/models/\",\n",
+    "    path=path,\n",
+    "    row=row,\n",
+    "    year=year,\n",
+    "    access_key_id=access_key_id,\n",
+    "    secret_access_key=secret_access_key,\n",
+    "    output_write_bucket=f\"{bucket}/inference\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.read_parquet(\"s3://carbonplan-climatetrace/v2/inference/rf/2010/054018.parquet\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -404,9 +559,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python [conda env:notebook] *",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-notebook-py"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/notebooks/processing/model.ipynb b/notebooks/processing/model.ipynb
index 63daad8..1d13bc0 100644
--- a/notebooks/processing/model.ipynb
+++ b/notebooks/processing/model.ipynb
@@ -45,7 +45,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "realms = list(REALM_GROUPINGS.keys())"
+    "# we train one model per realm\n",
+    "\n",
+    "# realms = list(REALM_GROUPINGS.keys())\n",
+    "# only use australia for example, but we would want all when rerunning this\n",
+    "realms = [\"australia\"]"
    ]
   },
   {
@@ -55,43 +59,44 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# HPO\n",
-    "import itertools\n",
-    "\n",
-    "\n",
-    "def product_dict(**kwargs):\n",
-    "    keys = kwargs.keys()\n",
-    "    vals = kwargs.values()\n",
-    "    for instance in itertools.product(*vals):\n",
-    "        yield dict(zip(keys, instance))\n",
-    "\n",
-    "\n",
-    "param_set = {\n",
-    "    \"learning_rate\": [0.07, 0.05, 0.03],\n",
-    "    \"max_depth\": [10, 12, 14],\n",
-    "    \"colsample_bytree\": [0.5, 0.7, 0.9],\n",
-    "    \"subsample\": [0.5, 0.7, 0.9],\n",
-    "    \"min_child_weight\": [2, 4, 6],\n",
-    "    \"lambda\": [1, 1.5, 2],\n",
-    "    \"alpha\": [0, 0.5, 1],\n",
-    "    \"gamma\": [0, 0.5, 1],\n",
-    "}\n",
-    "\n",
-    "groupings = [\n",
-    "    [\"learning_rate\"],\n",
-    "    [\"max_depth\"],\n",
-    "    [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n",
-    "    [\"lambda\", \"alpha\", \"gamma\"],\n",
-    "]\n",
-    "\n",
-    "dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n",
-    "param_set_list = []\n",
-    "for orders in list(itertools.product(*dims)):\n",
-    "    d = {}\n",
-    "    for o, g in zip(orders, groupings):\n",
-    "        for k in g:\n",
-    "            d[k] = param_set[k][o]\n",
-    "    param_set_list.append(d)"
+    "# This block of code is used for generating different parameter sets for hyperparameter optimization (HPO) of the model\n",
+    "# the params here are for the xgboost model\n",
+    "\n",
+    "# import itertools\n",
+    "\n",
+    "# def product_dict(**kwargs):\n",
+    "#     keys = kwargs.keys()\n",
+    "#     vals = kwargs.values()\n",
+    "#     for instance in itertools.product(*vals):\n",
+    "#         yield dict(zip(keys, instance))\n",
+    "\n",
+    "\n",
+    "# param_set = {\n",
+    "#     \"learning_rate\": [0.07, 0.05, 0.03],\n",
+    "#     \"max_depth\": [10, 12, 14],\n",
+    "#     \"colsample_bytree\": [0.5, 0.7, 0.9],\n",
+    "#     \"subsample\": [0.5, 0.7, 0.9],\n",
+    "#     \"min_child_weight\": [2, 4, 6],\n",
+    "#     \"lambda\": [1, 1.5, 2],\n",
+    "#     \"alpha\": [0, 0.5, 1],\n",
+    "#     \"gamma\": [0, 0.5, 1],\n",
+    "# }\n",
+    "\n",
+    "# groupings = [\n",
+    "#     [\"learning_rate\"],\n",
+    "#     [\"max_depth\"],\n",
+    "#     [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n",
+    "#     [\"lambda\", \"alpha\", \"gamma\"],\n",
+    "# ]\n",
+    "\n",
+    "# dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n",
+    "# param_set_list = []\n",
+    "# for orders in list(itertools.product(*dims)):\n",
+    "#     d = {}\n",
+    "#     for o, g in zip(orders, groupings):\n",
+    "#         for k in g:\n",
+    "#             d[k] = param_set[k][o]\n",
+    "#     param_set_list.append(d)"
    ]
   },
   {
@@ -101,6 +106,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# helper functions for assessing model performances\n",
+    "\n",
+    "\n",
     "def get_all_prediction_result(model, df_train, df_test, df_val):\n",
     "\n",
     "    df_train[\"biomass_pred\"] = model._predict(df_train)\n",
@@ -124,7 +132,21 @@
     "    mae = (merged.biomass_year2 - merged.biomass_year1).abs().mean()\n",
     "    me = (merged.biomass_year2 - merged.biomass_year1).mean()\n",
     "\n",
-    "    return {\"mae\": mae, \"me\": me}"
+    "    return {\"mae\": mae, \"me\": me}\n",
+    "\n",
+    "\n",
+    "def plot_scatter(sub, title, n=500000):\n",
+    "    xmin = -10\n",
+    "    size = min(len(sub), n)\n",
+    "    toplot = sub.sample(n=size)\n",
+    "    xmax = toplot.biomass.quantile(0.95)\n",
+    "    plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n",
+    "    plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n",
+    "    plt.xlabel(\"True Biomass (Mg/ha)\")\n",
+    "    plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n",
+    "    plt.xlim(xmin, xmax)\n",
+    "    plt.ylim(xmin, xmax)\n",
+    "    plt.title(title)"
    ]
   },
   {
@@ -137,11 +159,16 @@
    "outputs": [],
    "source": [
     "scores = []\n",
-    "random_split = False\n",
+    "# whether to randomly split the train/test data or to split train/test based on year\n",
+    "# doesn't seem to make too big of a difference on validation performance\n",
+    "random_split = True\n",
+    "# whether to reload the training data from individual years, or use the compiled data directly\n",
+    "# only needs to be True when the training data is re-generated\n",
     "reload = False\n",
+    "# whether to overwrite the models already trained\n",
     "overwrite = False\n",
     "\n",
-    "for model_class in [m.random_forest_model]:  # m.xgb_model\n",
+    "for model_class in [m.random_forest_model, m.xgb_model]:\n",
     "    for realm in realms:\n",
     "        print(f\"Building model for {realm} realm\")\n",
     "\n",
@@ -154,8 +181,10 @@
     "        )\n",
     "        print(f\"    size of entire df is {round(df.size / 1e9, 2)}Gb\")\n",
     "\n",
-    "        for strategy in [\"last\"]:  # [\"first\", \"last\", \"no\"]:\n",
-    "            # split into train/test based on year\n",
+    "        for strategy in [\"none\"]:  # [\"first\", \"last\", \"none\"]:\n",
+    "            # strategy = \"first\" means that the first year is used for validation, and \"last\" means the last year is used for validation\n",
+    "            # strategy = none means that no data is reserved for validation => used for training the final production model,\n",
+    "            # whereas first/last allow us to assess model performance during the model design and tuning phases\n",
     "            df_train, df_test, df_val = m.train_test_split_based_on_year(\n",
     "                df, val_strategy=strategy, random_train_test=random_split\n",
     "            )\n",
@@ -163,23 +192,24 @@
     "            print(f\"    testing sample size = {len(df_test)}\")\n",
     "            print(f\"    eval sample size = {len(df_val)}\")\n",
     "\n",
-    "            # build 2 models: 1) baseline/mean, 2) xgboost\n",
-    "            # TODO: build linear model as another baseline model\n",
-    "            # m.baseline_model, m.gradient_boost_model, m.random_forest_model\n",
-    "\n",
+    "            # this for loop is for running different parameter sets in HPO\n",
     "            for params in [{}]:\n",
     "\n",
+    "                # instantiating the model also does .fit\n",
+    "                # this will load the model if it already exists and overwrite=False, and fit the model if overwrite=True or the model does not exist\n",
     "                model = model_class(\n",
     "                    realm=realm,\n",
     "                    df_train=df_train,\n",
     "                    df_test=df_test,\n",
-    "                    output_folder=\"s3://carbonplan-climatetrace/v1/models/\",\n",
+    "                    output_folder=\"s3://carbonplan-climatetrace/v2.1/models/\",  # v1 or v2\n",
     "                    overwrite=overwrite,\n",
-    "                    validation_year=\"none\",\n",
+    "                    validation_year=strategy,\n",
     "                    params=params,\n",
     "                )\n",
     "\n",
+    "                # do model evaluation on each split of the data: train, test, and validation\n",
     "                for split, sub in zip((\"train\", \"test\", \"val\"), (df_train, df_test, df_val)):\n",
+    "                    # validation data can be empty if val strategy = 'none'\n",
     "                    if len(sub) > 0:\n",
     "                        model_score = model.evaluate(sub)\n",
     "                        model_score[\"model_name\"] = model.name\n",
@@ -194,6 +224,7 @@
     "                df_train[\"biomass_pred\"] = model.predict(df_train)\n",
     "                df_test[\"biomass_pred\"] = model.predict(df_test)\n",
     "\n",
+    "            # plot the prediction result\n",
     "            plt.figure(figsize=(10, 4.5))\n",
     "            plt.subplot(1, 2, 1)\n",
     "            plot_scatter(df_train, title=f\"{realm} train samples\")\n",
@@ -203,49 +234,21 @@
     "            plt.show()\n",
     "            plt.close()\n",
     "\n",
-    "            plt.figure(figsize=(10, 4))\n",
-    "            plt.title(f\"{realm} feature importance\")\n",
-    "            xticks = np.arange(len(m.features)) * 2\n",
-    "            plt.bar(xticks, model.model.feature_importances_)\n",
-    "            plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n",
-    "            plt.savefig(f\"{realm}_feature_imp.png\")\n",
-    "            plt.show()\n",
-    "            plt.close()\n",
+    "            # plotting feature importance if the model being trained is random forest\n",
+    "            if \"rf\" in model.name:\n",
+    "                plt.figure(figsize=(10, 4))\n",
+    "                plt.title(f\"{realm} feature importance\")\n",
+    "                xticks = np.arange(len(m.features)) * 2\n",
+    "                plt.bar(xticks, model.model.feature_importances_)\n",
+    "                plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n",
+    "                plt.savefig(f\"{realm}_feature_imp.png\")\n",
+    "                plt.show()\n",
+    "                plt.close()\n",
+    "            # TODO: plot something else if we're training the xgboost model\n",
     "\n",
     "scores = pd.DataFrame(scores)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e6519687-11f4-41e0-b8fe-0191fecc98ea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_scatter(sub, title, n=500000):\n",
-    "    xmin = -10\n",
-    "    size = min(len(sub), n)\n",
-    "    toplot = sub.sample(n=size)\n",
-    "    xmax = toplot.biomass.quantile(0.95)\n",
-    "    plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n",
-    "    plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n",
-    "    plt.xlabel(\"True Biomass (Mg/ha)\")\n",
-    "    plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n",
-    "    plt.xlim(xmin, xmax)\n",
-    "    plt.ylim(xmin, xmax)\n",
-    "    plt.title(title)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "527a24c3-477f-4023-8816-3f2cb8d91ba3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_train.year.unique()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -253,7 +256,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "scores"
+    "scores\n",
+    "\n",
+    "# only selecting everything that's test or val split\n",
+    "# scores.loc[scores.split == 'val']\n",
+    "\n",
+    "# doing weighted average of the scores\n",
+    "# (scores.loc[scores.split == 'test'].r2 * scores.loc[scores.split == 'test'].sample_size).sum() / scores.loc[scores.split == 'test'].sample_size.sum()"
    ]
   },
   {
@@ -291,116 +300,6 @@
     "    sub = scores.loc[(scores.split == \"train\") & (scores.validation_year == validation_year)]\n",
     "    print(f\"training score   = {(sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()}\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bea208ad-daa1-4811-bbe0-cbc6e1ff75dc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "temporal_variability = pd.read_csv(\"temporal_variability.csv\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cd8e436c-3a67-4541-9772-3ca633323102",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "temporal_variability[\"realm\"] = temporal_variability.model_name.apply(lambda x: x.split(\"_\")[1])\n",
-    "temporal_variability[\"model_type\"] = temporal_variability.model_name.apply(\n",
-    "    lambda x: x.split(\"_\")[0]\n",
-    ")\n",
-    "\n",
-    "sample_size = (\n",
-    "    scores.loc[(scores.random_split == True) & (scores.model_name.str.startswith(\"xgb\"))]\n",
-    "    .groupby(\"realm\")\n",
-    "    .sample_size.sum()\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "716e1e97-4f6c-4609-8f2f-bfd7bcd6bdba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# weighted average\n",
-    "merged = temporal_variability.loc[temporal_variability.random_split != True].merge(\n",
-    "    sample_size, how=\"left\", on=\"realm\"\n",
-    ")\n",
-    "name_dict = {\n",
-    "    \"gb\": \"gradient boosting\",\n",
-    "    \"ground\": \"lidar derived\",\n",
-    "    \"rf\": \"random forest\",\n",
-    "    \"xgb\": \"xgboost\",\n",
-    "}\n",
-    "merged[\"model_type\"] = merged.model_type.apply(lambda x: name_dict[x])\n",
-    "\n",
-    "print(\n",
-    "    \"Biomass MAE between years 2007 and 2008 of the same location using different model architecture\"\n",
-    ")\n",
-    "print(\"\")\n",
-    "for model, g in merged.groupby(\"model_type\"):\n",
-    "    print(\n",
-    "        model.ljust(20),\n",
-    "        np.round((g.mae * g.sample_size).sum() / g.sample_size.sum(), 4),\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "90400beb-c2bc-443f-9aa8-a2e76a57a9b8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# simple average\n",
-    "temporal_variability.loc[temporal_variability.random_split != True].merge(\n",
-    "    sample_size, how=\"left\", on=\"realm\"\n",
-    ").groupby(\"model_type\").mae.mean()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f3498408-2ad8-4aed-a8bb-f6d722ba025a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "scores = pd.read_csv(\"HPO_1.csv\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6e5d4987-11af-4f8b-bc48-965bb7838310",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.loc[df.split == \"test\"].groupby(\n",
-    "    [\"learning_rate\", \"max_depth\", \"colsample_bytree\", \"lambda\"]\n",
-    ").mean().sort_values(by=\"r2\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea82ac5c-5cbb-4b87-b739-f98249a73b02",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# from sklearn.preprocessing import OneHotEncoder\n",
-    "# igbp_encoder = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore').fit(df_train[['igbp']])\n",
-    "#     # one hot encoding for igbp\n",
-    "#     encoded_igbp = igbp_encoder.transform(X[['igbp']])\n",
-    "#     X = X.drop(['igbp'], axis=1)\n",
-    "#     for i in range(encoded_igbp.shape[1]):\n",
-    "#         X[f'igbp_cat_{str(i+1)}'] = encoded_igbp[:, i]"
-   ]
   }
  ],
  "metadata": {

From 2fbd2f7afb07eb10de9fdcb0174ddf64b82e4734 Mon Sep 17 00:00:00 2001
From: Oriana <orianac@uw.edu>
Date: Tue, 8 Feb 2022 21:08:42 +0000
Subject: [PATCH 2/2] add year column into the data frame for model training

---
 carbonplan_trace/v1/load.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/carbonplan_trace/v1/load.py b/carbonplan_trace/v1/load.py
index 1bcc3bc..2f9e07b 100644
--- a/carbonplan_trace/v1/load.py
+++ b/carbonplan_trace/v1/load.py
@@ -235,10 +235,13 @@ def training(realm, y0=2003, y1=2010, reload=False, access_key_id=None, secret_a
     else:
         output = []
         for yr in range(y0, y1):
+            print(yr)
             folder_name = f's3://carbonplan-climatetrace/v2/training/{realm}/{yr}/'
             files = fs.ls(folder_name)
             for f in files:
-                output.append(pd.read_parquet(f's3://{f}'))
+                single_df = pd.read_parquet(f's3://{f}')
+                single_df['year'] = yr
+                output.append(single_df)
         output = pd.concat(output)
         utils.write_parquet(output, output_filename, access_key_id, secret_access_key)
         return output