From ce8902d9004bb1bcf891bd19405d6602b548b44e Mon Sep 17 00:00:00 2001 From: Oriana Date: Tue, 8 Feb 2022 19:54:44 +0000 Subject: [PATCH 1/2] changes for model training and inference --- carbonplan_trace/v1/load.py | 4 +- notebooks/processing/inference.ipynb | 221 +++++++++++++++++--- notebooks/processing/model.ipynb | 297 +++++++++------------------ 3 files changed, 288 insertions(+), 234 deletions(-) diff --git a/carbonplan_trace/v1/load.py b/carbonplan_trace/v1/load.py index 48415f7..1bcc3bc 100644 --- a/carbonplan_trace/v1/load.py +++ b/carbonplan_trace/v1/load.py @@ -229,13 +229,13 @@ def biomass(tiles, year): def training(realm, y0=2003, y1=2010, reload=False, access_key_id=None, secret_access_key=None): - output_filename = f's3://carbonplan-climatetrace/v1/training/{realm}/all_data.parquet' + output_filename = f's3://carbonplan-climatetrace/v2/training/{realm}/all_data.parquet' if fs.exists(output_filename) and not reload: return pd.read_parquet(output_filename) else: output = [] for yr in range(y0, y1): - folder_name = f's3://carbonplan-climatetrace/v1/training/{realm}/{yr}/' + folder_name = f's3://carbonplan-climatetrace/v2/training/{realm}/{yr}/' files = fs.ls(folder_name) for f in files: output.append(pd.read_parquet(f's3://{f}')) diff --git a/notebooks/processing/inference.ipynb b/notebooks/processing/inference.ipynb index 0f103fe..9db759b 100644 --- a/notebooks/processing/inference.ipynb +++ b/notebooks/processing/inference.ipynb @@ -50,6 +50,17 @@ "from carbonplan_trace.v1 import utils\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pyproj\n", + "\n", + "pyproj.__version__" + ] + }, { "cell_type": "code", "execution_count": null, @@ -88,7 +99,8 @@ " # spin up local cluster. 
must be on big enough machine\n", " from dask.distributed import Client\n", "\n", - " client = Client(n_workers=2, threads_per_worker=15, resources={\"workertoken\": 1})\n", + " # when very very huge use 8,8\n", + " client = Client(n_workers=8, threads_per_worker=8, resources={\"workertoken\": 1})\n", " client\n", "else:\n", " gateway = Gateway()\n", @@ -107,15 +119,6 @@ "# cluster.scale(100)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cluster" - ] - }, { "cell_type": "code", "execution_count": null, @@ -145,6 +148,17 @@ " cluster.shutdown()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shutdown_cluster(\"local\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -171,6 +185,20 @@ "tiles and write it out to a mapper with those specifications.\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ul_lats = [\"10S\", \"20S\", \"30S\"]\n", + "ul_lons = [f\"{lon}E\" for lon in np.arange(110, 151, 10)]\n", + "lat_lon_tags = []\n", + "for ul_lat in ul_lats:\n", + " for ul_lon in ul_lons:\n", + " lat_lon_tags.append((ul_lat, ul_lon))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -182,11 +210,12 @@ " \"palladium/production/s3fs-public/atoms/files/\"\n", " \"WRS2_descending_0.zip\"\n", ")\n", - "bucket = \"s3://carbonplan-climatetrace/v1\"\n", + "bucket = \"s3://carbonplan-climatetrace/v2.1\"\n", "\n", - "biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n", - "biomass_files = fs.ls(biomass_folder)\n", - "lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n", + "# biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n", + "# biomass_files = fs.ls(biomass_folder) # just to get list of lat_lon tiles we want\n", + "# lat_lon_tags = 
[utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n", + "# lat_lon_tags = [('60N', '130W')]#, ('40N', '130W')]#, ('00N', '060W')] #('50N', '130W'),\n", "bounding_boxes = [utils.parse_bounding_box_from_lat_lon_tags(lat, lon) for lat, lon in lat_lon_tags]" ] }, @@ -199,10 +228,11 @@ "from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS\n", "\n", "processed_scenes = []\n", - "for year in np.arange(2014, 2021):\n", - " processed_scenes.extend(fs.ls(f\"{bucket}/inference/rf/{year}\", recursive=True))\n", + "for year in np.arange(2011, 2022):\n", + " processed_scenes.extend(fs.ls(f\"{bucket}/inference/xg/{year}\", recursive=True))\n", "\n", - "processed_scenes = [scene[-19:-8] for scene in processed_scenes]" + "processed_scenes = [scene[-19:-8] for scene in processed_scenes]\n", + "len(processed_scenes)" ] }, { @@ -211,7 +241,15 @@ "metadata": {}, "outputs": [], "source": [ - "len(processed_scenes)" + "import carbonplan_trace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll loop through every scene and every year and calculate biomass for that scene. 
Will produce\n", + "table of values [x, y, (both specific to utm projection), lat, lon, biomass].\n" ] }, { @@ -220,7 +258,9 @@ "metadata": {}, "outputs": [], "source": [ - "len(processed_scenes) - 57875" + "for bounding_box in bounding_boxes:\n", + " min_lat, max_lat, min_lon, max_lon = bounding_box\n", + " valid_scenes = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values" ] }, { @@ -229,15 +269,54 @@ "metadata": {}, "outputs": [], "source": [ - "len(bounding_boxes)" + "file_lengths = pd.DataFrame(\n", + " columns=[\"v1-rf\", \"v2-rf\", \"v2-xg\"],\n", + " index=[\"_\".join([str(path), str(row)]) for (path, row) in valid_scenes],\n", + ")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# rerun_scenes = {'2010':[], '2014':[]}\n", + "# setups = [('v2', 'rf')]#, ('v2', 'xg')] #('v1', 'rf'),\n", + "# for year in ['2010', '2014']:\n", + "# for (version, model) in setups:\n", + "# for [path, row] in valid_scenes:\n", + "# output_name = f\"{year}/{path:03d}{row:03d}.parquet\"\n", + "# print(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')\n", + "# if len(fs.ls(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')) == 0:\n", + "# if [path, row] not in rerun_scenes[year]:\n", + "# rerun_scenes[year].append([path, row])\n", + "# i+=1\n", + "# file_length = len(pd.read_parquet(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}'))\n", + "# except FileNotFoundError:\n", + "# file_length = np.nan\n", + "\n", + "# file_lengths.loc[f'{path}_{row}', f'{version}-{model}'] = file_length" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "We'll loop through every scene and every year and calculate biomass for that scene. 
Will produce\n", - "table of values [x, y, (both specific to utm projection), lat, lon, biomass].\n" + "# file_lengths.to_csv('files_to_repeat.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove each entry in index" ] }, { @@ -249,15 +328,14 @@ "outputs": [], "source": [ "landsat_bucket = \"s3://usgs-landsat/collection02/level-2/standard/etm/{}/{:03d}/{:03d}/\"\n", - "\n", "with rio.Env(aws_session):\n", - " # tasks = []\n", + " tasks = []\n", " task_ids = []\n", " for bounding_box in bounding_boxes:\n", " print(bounding_box)\n", " min_lat, max_lat, min_lon, max_lon = bounding_box\n", " scenes_in_tile = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values\n", - " for year in np.arange(2014, 2021):\n", + " for year in np.arange(2011, 2022):\n", " for [path, row] in scenes_in_tile:\n", " scene_stores = fs.ls(landsat_bucket.format(year, path, row))\n", " output_name = f\"{year}/{path:03d}{row:03d}\"\n", @@ -265,9 +343,11 @@ " continue\n", " elif output_name in processed_scenes:\n", " continue\n", + " elif output_name in task_id:\n", + " continue\n", " else:\n", " tasks.append(\n", - " # predict(\n", + " # predict(\n", " client.compute(\n", " predict_delayed(\n", " model_folder=f\"{bucket}/models/\",\n", @@ -281,7 +361,7 @@ " resources={\"workertoken\": 1},\n", " )\n", " )\n", - " task_ids.append([path, row, year, max_lat, min_lon])" + " task_id.append(output_name)" ] }, { @@ -292,7 +372,7 @@ }, "outputs": [], "source": [ - "len(tasks)" + "len(rerun_scenes[\"2014\"])" ] }, { @@ -307,6 +387,15 @@ "results" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results" + ] + }, { "cell_type": "code", "execution_count": null, @@ -320,8 +409,8 @@ "# row = task_id[i][1]\n", "# year = task_id[i][2]\n", "\n", - "path = 93\n", - "row = 11\n", + "path = 48\n", + "row = 22\n", "year = 2014\n", "\n", "print(path, row, year)\n", @@ 
-337,6 +426,72 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/xg/2014/054018.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# i = 0\n", + "# path = task_id[i][0]\n", + "# row = task_id[i][1]\n", + "# year = task_id[i][2]\n", + "\n", + "path = 54\n", + "row = 18\n", + "year = 2010\n", + "\n", + "print(path, row, year)\n", + "\n", + "predict(\n", + " model_folder=f\"{bucket}/models/\",\n", + " path=path,\n", + " row=row,\n", + " year=year,\n", + " access_key_id=access_key_id,\n", + " secret_access_key=secret_access_key,\n", + " output_write_bucket=f\"{bucket}/inference\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.read_parquet(\"s3://carbonplan-climatetrace/v2/inference/rf/2010/054018.parquet\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -404,9 +559,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python [conda env:notebook] *", "language": "python", - "name": "python3" + "name": "conda-env-notebook-py" }, "language_info": { "codemirror_mode": { diff --git a/notebooks/processing/model.ipynb b/notebooks/processing/model.ipynb index 63daad8..1d13bc0 100644 --- a/notebooks/processing/model.ipynb +++ b/notebooks/processing/model.ipynb @@ -45,7 +45,11 @@ "metadata": {}, "outputs": [], "source": [ - "realms = 
list(REALM_GROUPINGS.keys())" + "# we train one model per realm\n", + "\n", + "# realms = list(REALM_GROUPINGS.keys())\n", + "# only use australia for example, but we would want all when rerunning this\n", + "realms = [\"australia\"]" ] }, { @@ -55,43 +59,44 @@ "metadata": {}, "outputs": [], "source": [ - "# HPO\n", - "import itertools\n", - "\n", - "\n", - "def product_dict(**kwargs):\n", - " keys = kwargs.keys()\n", - " vals = kwargs.values()\n", - " for instance in itertools.product(*vals):\n", - " yield dict(zip(keys, instance))\n", - "\n", - "\n", - "param_set = {\n", - " \"learning_rate\": [0.07, 0.05, 0.03],\n", - " \"max_depth\": [10, 12, 14],\n", - " \"colsample_bytree\": [0.5, 0.7, 0.9],\n", - " \"subsample\": [0.5, 0.7, 0.9],\n", - " \"min_child_weight\": [2, 4, 6],\n", - " \"lambda\": [1, 1.5, 2],\n", - " \"alpha\": [0, 0.5, 1],\n", - " \"gamma\": [0, 0.5, 1],\n", - "}\n", - "\n", - "groupings = [\n", - " [\"learning_rate\"],\n", - " [\"max_depth\"],\n", - " [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n", - " [\"lambda\", \"alpha\", \"gamma\"],\n", - "]\n", - "\n", - "dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n", - "param_set_list = []\n", - "for orders in list(itertools.product(*dims)):\n", - " d = {}\n", - " for o, g in zip(orders, groupings):\n", - " for k in g:\n", - " d[k] = param_set[k][o]\n", - " param_set_list.append(d)" + "# This block of code is used for generating different parameter sets for hyperparameter optimization (HPO) of the model\n", + "# the params here are for the xgboost model\n", + "\n", + "# import itertools\n", + "\n", + "# def product_dict(**kwargs):\n", + "# keys = kwargs.keys()\n", + "# vals = kwargs.values()\n", + "# for instance in itertools.product(*vals):\n", + "# yield dict(zip(keys, instance))\n", + "\n", + "\n", + "# param_set = {\n", + "# \"learning_rate\": [0.07, 0.05, 0.03],\n", + "# \"max_depth\": [10, 12, 14],\n", + "# \"colsample_bytree\": [0.5, 0.7, 0.9],\n", + "# 
\"subsample\": [0.5, 0.7, 0.9],\n", + "# \"min_child_weight\": [2, 4, 6],\n", + "# \"lambda\": [1, 1.5, 2],\n", + "# \"alpha\": [0, 0.5, 1],\n", + "# \"gamma\": [0, 0.5, 1],\n", + "# }\n", + "\n", + "# groupings = [\n", + "# [\"learning_rate\"],\n", + "# [\"max_depth\"],\n", + "# [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n", + "# [\"lambda\", \"alpha\", \"gamma\"],\n", + "# ]\n", + "\n", + "# dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n", + "# param_set_list = []\n", + "# for orders in list(itertools.product(*dims)):\n", + "# d = {}\n", + "# for o, g in zip(orders, groupings):\n", + "# for k in g:\n", + "# d[k] = param_set[k][o]\n", + "# param_set_list.append(d)" ] }, { @@ -101,6 +106,9 @@ "metadata": {}, "outputs": [], "source": [ + "# helper functions for assessing model performances\n", + "\n", + "\n", "def get_all_prediction_result(model, df_train, df_test, df_val):\n", "\n", " df_train[\"biomass_pred\"] = model._predict(df_train)\n", @@ -124,7 +132,21 @@ " mae = (merged.biomass_year2 - merged.biomass_year1).abs().mean()\n", " me = (merged.biomass_year2 - merged.biomass_year1).mean()\n", "\n", - " return {\"mae\": mae, \"me\": me}" + " return {\"mae\": mae, \"me\": me}\n", + "\n", + "\n", + "def plot_scatter(sub, title, n=500000):\n", + " xmin = -10\n", + " size = min(len(sub), n)\n", + " toplot = sub.sample(n=size)\n", + " xmax = toplot.biomass.quantile(0.95)\n", + " plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n", + " plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n", + " plt.xlabel(\"True Biomass (Mg/ha)\")\n", + " plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n", + " plt.xlim(xmin, xmax)\n", + " plt.ylim(xmin, xmax)\n", + " plt.title(title)" ] }, { @@ -137,11 +159,16 @@ "outputs": [], "source": [ "scores = []\n", - "random_split = False\n", + "# whether to randomly split the train/test data or to split train/test based on year\n", + "# doesn't seem to make too big of a difference on validation 
performance\n", + "random_split = True\n", + "# whether to reload the training data from individual years, or use the compiled data directly\n", + "# only needs to be True when the training data is re-generated\n", "reload = False\n", + "# whether to overwrite the models already trained\n", "overwrite = False\n", "\n", - "for model_class in [m.random_forest_model]: # m.xgb_model\n", + "for model_class in [m.random_forest_model, m.xgb_model]:\n", " for realm in realms:\n", " print(f\"Building model for {realm} realm\")\n", "\n", @@ -154,8 +181,10 @@ " )\n", " print(f\" size of entire df is {round(df.size / 1e9, 2)}Gb\")\n", "\n", - " for strategy in [\"last\"]: # [\"first\", \"last\", \"no\"]:\n", - " # split into train/test based on year\n", + " for strategy in [\"none\"]: # [\"first\", \"last\", \"none\"]:\n", + " # strategy = \"first\" means that the first year is used for validation, and \"last\" means the last year is used for validation\n", + " # strategy = none means that no data is reserved for validation => used for training the final production model,\n", + " # whereas first/last allow us to assess model performance during the model design and tuning phases\n", " df_train, df_test, df_val = m.train_test_split_based_on_year(\n", " df, val_strategy=strategy, random_train_test=random_split\n", " )\n", @@ -163,23 +192,24 @@ " print(f\" testing sample size = {len(df_test)}\")\n", " print(f\" eval sample size = {len(df_val)}\")\n", "\n", - " # build 2 models: 1) baseline/mean, 2) xgboost\n", - " # TODO: build linear model as another baseline model\n", - " # m.baseline_model, m.gradient_boost_model, m.random_forest_model\n", - "\n", + " # this for loop is for running different parameter sets in HPO\n", " for params in [{}]:\n", "\n", + " # instantiating the model also does .fit\n", + " # this will load the model if it already exists and overwrite=False, and fit the model if overwrite=True or the model does not exist\n", " model = model_class(\n", " 
realm=realm,\n", " df_train=df_train,\n", " df_test=df_test,\n", - " output_folder=\"s3://carbonplan-climatetrace/v1/models/\",\n", + " output_folder=\"s3://carbonplan-climatetrace/v2.1/models/\", # v1 or v2\n", " overwrite=overwrite,\n", - " validation_year=\"none\",\n", + " validation_year=strategy,\n", " params=params,\n", " )\n", "\n", + " # do model evaluation on each split of the data: train, test, and validation\n", " for split, sub in zip((\"train\", \"test\", \"val\"), (df_train, df_test, df_val)):\n", + " # validation data can be empty if val strategy = 'none'\n", " if len(sub) > 0:\n", " model_score = model.evaluate(sub)\n", " model_score[\"model_name\"] = model.name\n", @@ -194,6 +224,7 @@ " df_train[\"biomass_pred\"] = model.predict(df_train)\n", " df_test[\"biomass_pred\"] = model.predict(df_test)\n", "\n", + " # plot the prediction result\n", " plt.figure(figsize=(10, 4.5))\n", " plt.subplot(1, 2, 1)\n", " plot_scatter(df_train, title=f\"{realm} train samples\")\n", @@ -203,49 +234,21 @@ " plt.show()\n", " plt.close()\n", "\n", - " plt.figure(figsize=(10, 4))\n", - " plt.title(f\"{realm} feature importance\")\n", - " xticks = np.arange(len(m.features)) * 2\n", - " plt.bar(xticks, model.model.feature_importances_)\n", - " plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n", - " plt.savefig(f\"{realm}_feature_imp.png\")\n", - " plt.show()\n", - " plt.close()\n", + " # plotting feature importance if the model being trained is random forest\n", + " if \"rf\" in model.name:\n", + " plt.figure(figsize=(10, 4))\n", + " plt.title(f\"{realm} feature importance\")\n", + " xticks = np.arange(len(m.features)) * 2\n", + " plt.bar(xticks, model.model.feature_importances_)\n", + " plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n", + " plt.savefig(f\"{realm}_feature_imp.png\")\n", + " plt.show()\n", + " plt.close()\n", + " # TODO: plot something else if we're training the xgboost model\n", "\n", "scores = pd.DataFrame(scores)" ] 
}, - { - "cell_type": "code", - "execution_count": null, - "id": "e6519687-11f4-41e0-b8fe-0191fecc98ea", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_scatter(sub, title, n=500000):\n", - " xmin = -10\n", - " size = min(len(sub), n)\n", - " toplot = sub.sample(n=size)\n", - " xmax = toplot.biomass.quantile(0.95)\n", - " plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n", - " plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n", - " plt.xlabel(\"True Biomass (Mg/ha)\")\n", - " plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n", - " plt.xlim(xmin, xmax)\n", - " plt.ylim(xmin, xmax)\n", - " plt.title(title)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "527a24c3-477f-4023-8816-3f2cb8d91ba3", - "metadata": {}, - "outputs": [], - "source": [ - "df_train.year.unique()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -253,7 +256,13 @@ "metadata": {}, "outputs": [], "source": [ - "scores" + "scores\n", + "\n", + "# only selecting everything that's test or val split\n", + "# scores.loc[scores.split == 'val']\n", + "\n", + "# doing weighted average of the scores\n", + "# (scores.loc[scores.split == 'test'].r2 * scores.loc[scores.split == 'test'].sample_size).sum() / scores.loc[scores.split == 'test'].sample_size.sum()" ] }, { @@ -291,116 +300,6 @@ " sub = scores.loc[(scores.split == \"train\") & (scores.validation_year == validation_year)]\n", " print(f\"training score = {(sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()}\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bea208ad-daa1-4811-bbe0-cbc6e1ff75dc", - "metadata": {}, - "outputs": [], - "source": [ - "temporal_variability = pd.read_csv(\"temporal_variability.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd8e436c-3a67-4541-9772-3ca633323102", - "metadata": {}, - "outputs": [], - "source": [ - "temporal_variability[\"realm\"] = temporal_variability.model_name.apply(lambda x: x.split(\"_\")[1])\n", 
- "temporal_variability[\"model_type\"] = temporal_variability.model_name.apply(\n", - " lambda x: x.split(\"_\")[0]\n", - ")\n", - "\n", - "sample_size = (\n", - " scores.loc[(scores.random_split == True) & (scores.model_name.str.startswith(\"xgb\"))]\n", - " .groupby(\"realm\")\n", - " .sample_size.sum()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "716e1e97-4f6c-4609-8f2f-bfd7bcd6bdba", - "metadata": {}, - "outputs": [], - "source": [ - "# weighted average\n", - "merged = temporal_variability.loc[temporal_variability.random_split != True].merge(\n", - " sample_size, how=\"left\", on=\"realm\"\n", - ")\n", - "name_dict = {\n", - " \"gb\": \"gradient boosting\",\n", - " \"ground\": \"lidar derived\",\n", - " \"rf\": \"random forest\",\n", - " \"xgb\": \"xgboost\",\n", - "}\n", - "merged[\"model_type\"] = merged.model_type.apply(lambda x: name_dict[x])\n", - "\n", - "print(\n", - " \"Biomass MAE between years 2007 and 2008 of the same location using different model architecture\"\n", - ")\n", - "print(\"\")\n", - "for model, g in merged.groupby(\"model_type\"):\n", - " print(\n", - " model.ljust(20),\n", - " np.round((g.mae * g.sample_size).sum() / g.sample_size.sum(), 4),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90400beb-c2bc-443f-9aa8-a2e76a57a9b8", - "metadata": {}, - "outputs": [], - "source": [ - "# simple average\n", - "temporal_variability.loc[temporal_variability.random_split != True].merge(\n", - " sample_size, how=\"left\", on=\"realm\"\n", - ").groupby(\"model_type\").mae.mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3498408-2ad8-4aed-a8bb-f6d722ba025a", - "metadata": {}, - "outputs": [], - "source": [ - "scores = pd.read_csv(\"HPO_1.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e5d4987-11af-4f8b-bc48-965bb7838310", - "metadata": {}, - "outputs": [], - "source": [ - "df.loc[df.split == \"test\"].groupby(\n", 
- " [\"learning_rate\", \"max_depth\", \"colsample_bytree\", \"lambda\"]\n", - ").mean().sort_values(by=\"r2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea82ac5c-5cbb-4b87-b739-f98249a73b02", - "metadata": {}, - "outputs": [], - "source": [ - "# from sklearn.preprocessing import OneHotEncoder\n", - "# igbp_encoder = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore').fit(df_train[['igbp']])\n", - "# # one hot encoding for igbp\n", - "# encoded_igbp = igbp_encoder.transform(X[['igbp']])\n", - "# X = X.drop(['igbp'], axis=1)\n", - "# for i in range(encoded_igbp.shape[1]):\n", - "# X[f'igbp_cat_{str(i+1)}'] = encoded_igbp[:, i]" - ] } ], "metadata": { From 2fbd2f7afb07eb10de9fdcb0174ddf64b82e4734 Mon Sep 17 00:00:00 2001 From: Oriana Date: Tue, 8 Feb 2022 21:08:42 +0000 Subject: [PATCH 2/2] add year column into the data frame for model training --- carbonplan_trace/v1/load.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/carbonplan_trace/v1/load.py b/carbonplan_trace/v1/load.py index 1bcc3bc..2f9e07b 100644 --- a/carbonplan_trace/v1/load.py +++ b/carbonplan_trace/v1/load.py @@ -235,10 +235,13 @@ def training(realm, y0=2003, y1=2010, reload=False, access_key_id=None, secret_a else: output = [] for yr in range(y0, y1): + print(yr) folder_name = f's3://carbonplan-climatetrace/v2/training/{realm}/{yr}/' files = fs.ls(folder_name) for f in files: - output.append(pd.read_parquet(f's3://{f}')) + single_df = pd.read_parquet(f's3://{f}') + single_df['year'] = yr + output.append(single_df) output = pd.concat(output) utils.write_parquet(output, output_filename, access_key_id, secret_access_key) return output