-
Notifications
You must be signed in to change notification settings - Fork 3
add slidescore url and slidescore study id to download manifest. #42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
a021ff7
46ebb0a
0a47294
c5e19a6
2790c91
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,9 @@ | |
| This module contains the CLI utilities that can be used with slidescore in python. | ||
|
|
||
| """ | ||
|
|
||
| # pylint: disable=duplicate-code | ||
|
|
||
| import argparse | ||
| import csv | ||
| import json | ||
|
|
@@ -15,7 +18,7 @@ | |
| from collections import defaultdict | ||
| from enum import Enum | ||
| from pathlib import Path | ||
| from typing import Iterable, Optional | ||
| from typing import Iterable, List, Optional, Union | ||
|
|
||
| import shapely.geometry | ||
| from tqdm import tqdm | ||
|
|
@@ -70,6 +73,7 @@ def parse_api_token(data: Optional[Path] = None) -> str: | |
|
|
||
|
|
||
| def _shapely_to_slidescore(shapely_object): | ||
| # pylint: disable=too-many-branches | ||
| shapely_type = type(shapely_object) | ||
| if shapely_type == shapely.geometry.Polygon: | ||
| if len(shapely_object.interiors) != 0: | ||
|
|
@@ -351,30 +355,117 @@ def _download_labels(args: argparse.Namespace) -> None: | |
| ) | ||
|
|
||
|
|
||
| def append_to_manifest(save_dir: pathlib.Path, image_id: int, filename: pathlib.Path) -> None: | ||
| def append_to_tsv_mapping(save_dir: pathlib.Path, items: List[str]) -> None: | ||
| """ | ||
| Create a manifest mapping image id to the filename. | ||
| Create a manifest mapping image id to image name | ||
|
|
||
| Creates a file that looks like | ||
| ```txt | ||
| # <slidescore_url> | ||
| # <slidescore_study_id> | ||
| <image_id_1> <image_name_1> | ||
| ... | ||
| ``` | ||
|
|
||
| Parameters | ||
| ---------- | ||
| save_dir: pathlib.Path | ||
| items: List[str] | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
| """ | ||
| if not save_dir.is_dir(): | ||
| save_dir.mkdir(parents=True) | ||
| tab = "\t" | ||
| with open(save_dir / "slidescore_mapping.tsv", "a+", encoding="utf-8") as file: | ||
| file.write(f"{tab.join(items)}\n") | ||
|
|
||
|
|
||
| def append_to_json_mapping(save_dir: pathlib.Path, keys: List[str], value: Union[pathlib.Path, int, str]) -> None: | ||
| """ | ||
| Generic method to append a hierarchical key structure with one value to a dictionary in a json file to fix | ||
| missing functionality in python dict classes | ||
|
|
||
| Works as desired: | ||
| >> {}['a'] = 1 | ||
| {'a': 1'} | ||
|
|
||
| Does not work as desired in this case | ||
| >> {}['a', 'b'] = 1 | ||
| {('a', 'b'): 1} | ||
|
|
||
| The following is not possible in a python dict, and throws an error | ||
| >> {}[['a', 'b']] = 1 | ||
| TypeError: unhashable type: 'list' | ||
|
|
||
| We wish to get | ||
| >> {}[['a', 'b']] = 1 | ||
| {'a': {'b': 1}} | ||
|
|
||
| Used to create a manifest mapping filename to slidescore ID Created when downloading WSIs from slidescores. | ||
|
|
||
| Will end up with something like | ||
| { | ||
| 'slidescore_url': str, | ||
| 'slidescore_study_id': int, | ||
| 'slide_filename_to_id_mapping': { | ||
| str: int | ||
| ... | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not filename anymore is it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not the structure indeed. changed |
||
|
|
||
| Parameters | ||
| ---------- | ||
| save_dir : pathlib.Path | ||
| image_id : int | ||
| filename : pathlib.Path | ||
| keys : List[Union[str, pathlib.Path]], sets the hierarchical keys to be set. | ||
| E.g. ['slide_filename_id_mapping', 'filename'] | ||
| sets manifest['slide_filename_id_mapping']['filename'] to the given slidescore ID | ||
| value : Union[int, pathlib.Path, str], value to be set. Generally either a URL, a path, or an integer ID | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
| """ | ||
| with open(save_dir / "slidescore_mapping.txt", "a", encoding="utf-8") as file: | ||
| file.write(f"{image_id} {filename.name}\n") | ||
| # Make dir if it doesn't exist. Usage in the CLI, however, places it in an existing directory | ||
| if not save_dir.is_dir(): | ||
| save_dir.mkdir(parents=True) | ||
|
|
||
| value = value.name if isinstance(value, pathlib.Path) else value | ||
| config_filepath = save_dir / "slidescore_mapping.json" | ||
|
|
||
| try: | ||
| # Read file if it exists | ||
| with open(config_filepath, mode="r", encoding="utf-8") as file: | ||
| obj = json.load(file) | ||
| except FileNotFoundError: | ||
| # Otherwise create new object | ||
| obj = {} | ||
|
|
||
| new_obj = obj # Create a pointer that we can update | ||
| for idx, key in enumerate(keys): | ||
| if idx == len(keys) - 1: # At the last node | ||
| new_obj[key] = value # Set the leaf | ||
| else: | ||
| if key not in new_obj.keys(): | ||
| new_obj[key] = {} # Not at the last node and the key does not exist; make a subdict | ||
| new_obj = new_obj[key] # Update pointer | ||
|
|
||
| # Save file, overwriting the old file | ||
| with open(config_filepath, mode="w", encoding="utf-8") as file: | ||
| json.dump(obj, file, ensure_ascii=False, indent=4) | ||
|
|
||
|
|
||
| # pylint: disable=too-many-arguments | ||
| def download_wsis( | ||
| slidescore_url: str, | ||
| api_token: str, | ||
| study_id: int, | ||
| save_dir: pathlib.Path, | ||
| disable_certificate_check: bool = False, | ||
| disable_download: bool = False, | ||
| mapping_format: str = "json", | ||
|
Comment on lines
+502
to
+503
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there no other way to get the mapping only?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not that i can think of without duplicating a lot of code or entirely refactoring the function into less coherent parts. |
||
| ) -> None: | ||
| """ | ||
| Download all WSIs for a given study from SlideScore | ||
|
|
@@ -386,6 +477,9 @@ def download_wsis( | |
| study_id : int | ||
| save_dir : pathlib.Path | ||
| disable_certificate_check : bool | ||
| disable_download : bool | ||
| mapping_format: str | ||
| either of "json" or "tsv" | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -399,23 +493,56 @@ def download_wsis( | |
| # Collect image metadata | ||
| images = client.get_images(study_id) | ||
|
|
||
| # # Add study details to mapping manifest | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Single # |
||
| if mapping_format == "json": | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, keys=[slidescore_url, str(study_id), "slidescore_url"], value=slidescore_url | ||
| ) | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, keys=[slidescore_url, str(study_id), "slidescore_study_id"], value=study_id | ||
| ) | ||
| elif mapping_format == "tsv": | ||
| append_to_tsv_mapping(save_dir=save_dir, items=[f"# {slidescore_url}"]) | ||
| append_to_tsv_mapping(save_dir=save_dir, items=[f"# {study_id}"]) | ||
|
|
||
| # Download and save WSIs | ||
| for image in tqdm(images): | ||
| image_id = image["id"] | ||
| image_name = image["name"] | ||
| if not disable_download: | ||
| logger.info("Downloading image for id: %s", image_id) | ||
| filename = client.download_slide(study_id, image, save_dir=save_dir) | ||
| logger.info("Image with id %s has been saved to %s.", image_id, filename) | ||
| if mapping_format == "json": | ||
| append_to_json_mapping( | ||
| save_dir=save_dir, | ||
| keys=[slidescore_url, str(study_id), "slide_filename_to_study_image_id_mapping", str(image_id)], | ||
| value=image_name, | ||
| ) | ||
| elif mapping_format == "tsv": | ||
| append_to_tsv_mapping( | ||
| save_dir=save_dir, | ||
| items=[str(image_id), image_name], | ||
| ) | ||
|
Comment on lines
+550
to
+554
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If not one of these, it's better to raise an error
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Thought this was done by the argparser, but raising error here makes the function more generic |
||
|
|
||
| logger.info("Downloading image for id: %s", image_id) | ||
| filename = client.download_slide(study_id, image, save_dir=save_dir) | ||
| logger.info("Image with id %s has been saved to %s.", image_id, filename) | ||
| append_to_manifest(save_dir, image_id, filename) | ||
|
|
||
| def _download_mapping(args: argparse.Namespace): | ||
| """Main function that downloads only the mapping from filename to slidescore slide ID | ||
|
|
||
| Calls _download_wsi while setting `disable_download=True` | ||
| """ | ||
| _download_wsi(args=args, disable_download=True) | ||
|
|
||
|
|
||
| def _download_wsi(args: argparse.Namespace): | ||
| def _download_wsi(args: argparse.Namespace, disable_download=False): | ||
| """Main function that downloads WSIs from SlideScore. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| args: argparse.Namespace | ||
| The arguments passed from the CLI. Run with `-h` to see the required parameters | ||
| disable_download: bool | ||
| If download is disabled, only the mapping is saved. Can also be used to debug. | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -429,6 +556,8 @@ def _download_wsi(args: argparse.Namespace): | |
| args.study_id, | ||
| args.output_dir, | ||
| disable_certificate_check=args.disable_certificate_check, | ||
| disable_download=disable_download, | ||
| mapping_format=args.mapping_format, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -445,6 +574,43 @@ def register_parser(parser: argparse._SubParsersAction): | |
|
|
||
| download_wsi_parser.set_defaults(subcommand=_download_wsi) | ||
|
|
||
| download_wsi_parser.add_argument( | ||
| "--mapping-format", | ||
| dest="mapping_format", | ||
| type=str, | ||
| help="Save mapping as either json or tsv", | ||
| choices=["tsv", "json"], | ||
| required=False, | ||
| default="tsv", | ||
| ) | ||
|
|
||
| download_mapping_parser = parser.add_parser( | ||
| "download-study-slide-mapping", | ||
| help="Download the download_config.json" | ||
| " with url, study id, and file to " | ||
| "slidescore ID mapping from SlideScore" | ||
| " without downloading the WSIs. " | ||
| "Useful if slides are already on disk," | ||
| "but slidescore information is not", | ||
| ) | ||
| download_mapping_parser.add_argument( | ||
| "output_dir", | ||
| type=pathlib.Path, | ||
| help="Directory to save output too.", | ||
| ) | ||
|
|
||
| download_mapping_parser.add_argument( | ||
| "--mapping-format", | ||
| dest="mapping_format", | ||
| type=str, | ||
| help="Save mapping as either json or tsv", | ||
| choices=["tsv", "json"], | ||
| required=False, | ||
| default="tsv", | ||
| ) | ||
|
|
||
| download_mapping_parser.set_defaults(subcommand=_download_mapping) | ||
|
|
||
| download_label_parser = parser.add_parser("download-labels", help="Download labels from SlideScore.") | ||
| download_label_parser.add_argument( | ||
| "-q", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,87 @@ | ||
| # coding=utf-8 | ||
| # Placeholder | ||
| def test_test(): | ||
| assert True | ||
| """ | ||
| Unit tests | ||
| """ | ||
|
|
||
| # pylint: disable=duplicate-code | ||
|
|
||
| import json | ||
| from pathlib import Path | ||
|
|
||
| from slidescore_api.cli import append_to_json_mapping, append_to_tsv_mapping | ||
|
|
||
|
|
||
| def test_append_to_json_mapping(): | ||
| # Ignores similar lines | ||
| """ | ||
| Tests the function of slidescore_api.cli.append_to_manifest | ||
| """ | ||
| if Path("./test_append_to_json_mapping/slidescore_mapping.json").is_file(): | ||
| Path("./test_append_to_json_mapping/slidescore_mapping.json").unlink() # Remove file | ||
|
|
||
| append_to_json_mapping( | ||
| save_dir=Path("./test_append_to_json_mapping"), | ||
| keys=["slidescore_url", "1234", "slidescore_url"], | ||
| value="https", | ||
| ) # -> {"slidescore_url": {"1234": {"slidescore_url": "https}}} | ||
|
|
||
| append_to_json_mapping( | ||
| save_dir=Path("./test_append_to_json_mapping"), keys=["slidescore_url", "1234", "slidescore_id"], value=1234 | ||
| ) # -> {"slidescore_url": {"1234": {"slidescore_url": "https, "slidescore_id": 1234}}} | ||
|
|
||
| append_to_json_mapping( | ||
| save_dir=Path("./test_append_to_json_mapping"), | ||
| keys=["slidescore_url", "1234", "mapping", "1234"], | ||
| value="slidename", | ||
| ) | ||
| # -> {"slidescore_url": {"1234": {"slidescore_url": "https, "slidescore_id": 1234, | ||
| # "mapping": {"pathname": 1234} }}} | ||
|
|
||
| expected_object = { | ||
| "slidescore_url": { | ||
| "1234": {"slidescore_url": "https", "slidescore_id": 1234, "mapping": {"1234": "slidename"}} | ||
| } | ||
| } | ||
|
|
||
| with open("./test_append_to_json_mapping/slidescore_mapping.json", "r", encoding="utf-8") as file: | ||
| obj = json.load(file) | ||
| assert ( | ||
| obj == expected_object | ||
| ), f"expected object is {expected_object}, while the actual object on disk is {obj}" | ||
|
|
||
| Path("./test_append_to_json_mapping/slidescore_mapping.json").unlink() # Remove file | ||
| Path("./test_append_to_json_mapping").rmdir() # Remove dir | ||
|
|
||
|
|
||
| def test_append_to_tsv_mapping(): | ||
| # Ignores similar lines | ||
| """ | ||
| Tests the function of slidescore_api.cli.append_to_tsv_manifest | ||
| """ | ||
| if Path("./test_append_to_tsv_mapping/slidescore_mapping.tsv").is_file(): | ||
| Path("./test_append_to_tsv_mapping/slidescore_mapping.tsv").unlink() # Remove file | ||
|
|
||
| append_to_tsv_mapping(save_dir=Path("./test_append_to_tsv_mapping"), items=["# slidescore_url"]) | ||
| # -> {"slidescore_url": {"1234": {"slidescore_url": "https}}} | ||
|
|
||
| append_to_tsv_mapping(save_dir=Path("./test_append_to_tsv_mapping"), items=["# slidescore_study_id"]) | ||
|
|
||
| append_to_tsv_mapping(save_dir=Path("./test_append_to_tsv_mapping"), items=["image_id", "image_name"]) | ||
|
|
||
| lines = [] | ||
| with open("./test_append_to_tsv_mapping/slidescore_mapping.tsv", "r", encoding="utf-8") as file: | ||
| for line in file: | ||
| lines.append(line) | ||
|
|
||
| assert lines[0] == "# slidescore_url\n" | ||
| assert lines[1] == "# slidescore_study_id\n" | ||
| assert lines[2] == "image_id\timage_name\n" | ||
|
|
||
| Path("./test_append_to_tsv_mapping/slidescore_mapping.tsv").unlink() # Remove file | ||
| Path("./test_append_to_tsv_mapping").rmdir() # Remove dir | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| test_append_to_json_mapping() | ||
| test_append_to_tsv_mapping() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add that if the file does not exist yet, you write the first two lines?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do this manually in
cli.download_wsis.Otherwise I'd need to always pass the
urlandidto the function.Also, if this would be the functionality, then downloading a second slidescory study into the same directory would NOT write the slidescore_url and study_id of the second study.