diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 4616936..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,80 +0,0 @@ -# [Code of Conduct](https://opensource.fb.com/code-of-conduct/) - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. - -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. 
- -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. - -This Code of Conduct also applies outside the project spaces when there is a -reasonable belief that an individual's behavior may have a negative impact on -the project or its community. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. 
- -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 9eb9602..0000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,31 +0,0 @@ -# Contributing to PartDistillation -We want to make contributing to this project as easy and transparent as -possible. - -## Pull Requests -We actively welcome your pull requests. - -1. Fork the repo and create your branch from `main`. -2. If you've added code that should be tested, add tests. -3. If you've changed APIs, update the documentation. -4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: - -## Issues -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - -## License -By contributing to OZI data, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. 
diff --git a/Detic b/Detic deleted file mode 120000 index 6c70317..0000000 --- a/Detic +++ /dev/null @@ -1 +0,0 @@ -/private/home/janghyuncho7/EmergentPartSeg/Detic/ \ No newline at end of file diff --git a/README.md b/README.md index 7b06e5e..fc2a941 100644 --- a/README.md +++ b/README.md @@ -1,170 +1,86 @@ # PartDistillation: Learning Parts from Instance Segmentation -PartDistillation learns to segment parts over 21k object categories without labels. +PartDistillation learns to segment parts over 10k object categories without labels. -

+

-> [**PartDistillation: Learning Parts from Instance Segmentation**](http://arxiv.org/abs/xxxx), -> Jang Hyun Cho, Philipp Krähenbühl, Vignash Ramanathan, -> *arxiv ([arXiv xxxx](http://arxiv.org/abs/xxxx))* +> [**PartDistillation: Learning Parts from Instance Segmentation**](https://openaccess.thecvf.com/content/CVPR2023/papers/Cho_PartDistillation_Learning_Parts_From_Instance_Segmentation_CVPR_2023_paper.pdf), +> [Jang Hyun Cho](https://janghyuncho.github.io), [Philipp Krähenbühl](http://www.philkr.net), [Vignesh Ramanathan](http://ai.stanford.edu/~vigneshr/), +> *CVPR 2023 [[paper](https://openaccess.thecvf.com/content/CVPR2023/papers/Cho_PartDistillation_Learning_Parts_From_Instance_Segmentation_CVPR_2023_paper.pdf), [project page](https://janghyuncho.github.io/PartDistillation_website/)]* Contact: janghyuncho7@utexas.edu -## Installation -Please see [installation instructions](). - -*Internal: See `INSTALL.md`.* - -## Getting Started - -See instructions for [preparing datasets]() and [preparing models]() for PartDistillation. - -*Internal: See `datasets/README.md` and `weights/README.md`.* - -## Training PartDistillation -PartDistillation has multiple stages to train the full model. -Parts are separated with object segmentation and we use Detic predictions to do the job. -To make the process fast, we save all detic predictions of ImageNet first. -### Save detic prediction +## :fire: News :fire: - -``` -./sh_files/detic/run.sh -``` -Above code will launch 60 parallel jobs to run detic and save the result at `pseudo_labels/object_labels/imagenet_22k_train/detic_predictions/`. - - -### Pixel grouping for class-agnostic part segments - - -``` -./sh_files/proposal_generation/run.sh -``` - -Above code will launch 40 parallel jobs. Pixel-grouping is good initial segments, but rough. -Need to smooth out with postprocessing. We postprocess all part segments with dense-CRF with the following command. 
- -``` -./sh_files/dcrf/run.sh -``` -*NOTE: change the number of processes in the submit files to accomodate the resource availability.* - - -Then, we start training part proposal model (1st stage), which is a class-agnostic part segmentation model based on Mask2Former. - - -### Part-proposal Learning - -``` -./sh_files/proposal_learning/train_multi.sh -``` - -Above code will train on 4 nodes with 256 batch size. Then, we need to establish global association for each object class. -This allows to produce consistent class label for each part during inference. We call this process *part ranking*. - -### Part Ranking - -``` -./sh_files/part_ranking/run.sh -``` +- PartDistillation demo is out! +- ImageNet-1K training commands. +- Initial commit. -This generate part segmentation labels with class (as cluster assignment). With this, we self-train -the entire system all-together. +## Features +- Unsupervised part segmentation using [emergent part signals](docs/ANALYSIS.md) from strong instance segmentation model. +- Open-vocabulary object-part segmentation ([try out here](docs/DEMO.md)). +- Self-training to discover novel parts over 10K object classes (**No part segmentation labels used!**). +- Strong zero-shot and few-shot performance. -### PartDistillation Training - -``` -./sh_files/part_distillation_training/train.sh -``` - -This will launch 4 node job training on entire ImageNet-21K dataset. - - -## Ablations - -### Supervised learning +## Installation +Please see [installation instructions](docs/INSTALL.md). -Supervised models can be trained with commands in `sh_files/supervised_learning/`. For example, -``` -./sh_files/supervised_learning/semseg/pascal.sh -``` -will launch a 4 node job training a Mask2Former-based model (same configuration as ours) on `train` split of Pascal Parts dataset. 
+## DEMO -### Fewshot training +A short demo for PartDistillation with an image of a `person` and a `bicycle`: -In, `sh_files/fewshot_learning/` there are all training commands for training fewshot. For example, +

+Use the following command to segment each class: ``` -./sh_files/supervised_learning/semseg/pascal.sh +python part_distillation_demo.py --input figs/input/bicycle_person.jpg --output figs/output/part_proposal/bicycle.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary bicycle --min-image-size 640 --non-overlapping +python part_distillation_demo.py --input figs/input/bicycle_person.jpg --output figs/output/part_proposal/person.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary person --min-image-size 640 --non-overlapping ``` -will launch 1 node job finetuning a pretrained model in fewshot setting. - -*NOTE: please modify `model_weights` and `percent` variables based on your needs.* +If setup correctly, it should look like this: +

+ + +

-# Locations (Internal) - -## Original codes - -``` -/private/home/janghyuncho7/EmergentPartSeg -``` +## Getting Started -## Notebooks +See instructions for [preparing datasets](docs/DATASETS.md) and [preparing models](docs/WEIGHTS.md) to train PartDistillation. -``` -/private/home/janghyuncho7/EmergentPartSeg/notebooks/ -``` +## Using PartDistillation -## Refactored code +Please refer to our [demo](docs/DEMO.md) to explore. Also, see [checkpoints and inference](docs/MODELZOO.md) to learn about how to use PartDistillation. -``` -/private/home/janghyuncho7/PartDistillation -``` -## Collages -``` -/checkpoint/janghyuncho7/PartDistillation/manual_eval_related/collages/ -``` -- diversity evaluation: `/checkpoint/janghyuncho7/PartDistillation/manual_eval_related/diversity_eval` -- collages (imagenet-22k): `/checkpoint/janghyuncho7/PartDistillation/manual_eval_related/collages/imagenet_22k_train/detic_predictions/` -- one-stage baseline: `/checkpoint/janghyuncho7/PartDistillation/manual_eval_related/collages/imagenet_1k_train` -- per-pixel baseline: `/checkpoint/janghyuncho7/PartDistillation/manual_eval_related/collages/imagenet_22k_train/per_pixel_learning_baseline/collage_3x3/` +## Training PartDistillation +For now, we prepared [compute-friendly training commands](docs/TRAINING_1K.md) with ImageNet-1K dataset. +This setting only requires a single 8-GPU node and matches the reported results in zero-shot and few-shot benchmarks. -## Pseudo labels +*The [original training commands](docs/TRAINING.md) on ImageNet-21K here.* -``` -/checkpoint/janghyuncho7/PartDistillation/pseudo_labels_saved/ -``` -- detic prediction: `/checkpoint/janghyuncho7/PartDistillation/pseudo_labels_saved/object_labels/imagenet_22k_train/detic_predictions/`. 
-- part segments by pixel grouping (IN22K, detic): `/checkpoint/janghyuncho7/PartDistillation/pseudo_labels_saved/part_labels/proposal_generation/imagenet_22k_train/detic_based/generated_proposals_new_processed/res3_res4/dot_4_norm_False/` -- part segments by pixel grouping (IN1K, m2f COCO): `/checkpoint/janghyuncho7/PartDistillation/pseudo_labels_saved/part_labels/proposal_generation/imagenet_1k_train/generated_proposals_processed/score_based/res4/l2_4/` -- part segmentation labels by part ranking (IN22K): `/checkpoint/janghyuncho7/PartDistillation/pseudo_labels_saved/part_labels/part_masks_with_class/imagenet_22k_train/` -## trained models -``` -/checkpoint/janghyuncho7/PartDistillation/models/ -``` -- initial pretrain weights: `/checkpoint/janghyuncho7/PartDistillation/models/pre_weights/weights` -- per-pixel baselines: `/checkpoint/janghyuncho7/PartDistillation/models/per_pixel_baselines` -- our models: `/checkpoint/janghyuncho7/PartDistillation/models/our_models` -- few-shot models: `/checkpoint/janghyuncho7/PartDistillation/models/fewshot` +## Benchmark Training and Evaluation -## everything else +We have zero-shot and few-shot benchmarks on various datasets. Please see [benchmark training and evaluation](docs/BENCHMARK.md) for detail. -``` -/checkpoint/janghyuncho7/PartDistillation/ -``` +## License +Copyright (c) Meta Platforms, Inc. and affiliates. -## Reproduction +This source code is licensed under the license found in the LICENSE file in the root directory of this source tree. -New annotations and models are saved in `pseudo_labels/` and `output/`. Please change the path in submit files if new annotations and models will be used in public. +## Citation -## License -Copyright (c) Meta Platforms, Inc. and affiliates. +If you find this project useful for your research, please cite our paper using the following bibtex. -This source code is licensed under the license found in the -LICENSE file in the root directory of this source tree. 
+ @InProceedings{Cho_2023_CVPR, + author = {Cho, Jang Hyun and Kr\"ahenb\"uhl, Philipp and Ramanathan, Vignesh}, + title = {PartDistillation: Learning Parts From Instance Segmentation}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2023}, + pages = {7152-7161} + } \ No newline at end of file diff --git a/base_trainer.py b/base_trainer.py index 719af00..331e5be 100644 --- a/base_trainer.py +++ b/base_trainer.py @@ -5,7 +5,7 @@ """ Base trainer module largely borrowed from Mask2Former: -https://github.com/facebookresearch/Mask2Former/blob/main/train_net.py +https://github.com/facebookresearch/Mask2Former/blob/main/train_net.py """ try: @@ -20,7 +20,7 @@ import itertools from typing import Any, Dict, List, Set import torch -import torch.nn as nn +import torch.nn as nn from detectron2.engine import DefaultTrainer from detectron2.projects.deeplab import build_lr_scheduler @@ -39,17 +39,17 @@ def get_mode(dataset_name): mode = "eval" elif "save_labels" in dataset_name: mode = "save" - return mode + return mode def maybe_dp(model): """ - helper function to access model with data parallel or distributed data parallel. + helper function to access model with data parallel or distributed data parallel. """ if isinstance(model, DDP) or isinstance(model, nn.DataParallel): - return model.module + return model.module else: - return model + return model class BaseTrainer(DefaultTrainer): @@ -93,11 +93,11 @@ def build_optimizer(cls, cfg, model): # Avoid duplicating parameters if value in memo: continue - - # NOTE: Use keyword such as "backbone" to easily freeze parameters. + + # NOTE: Use keyword such as "backbone" to easily freeze parameters. 
if len([_ for _ in cfg.MODEL.MASK_FORMER.FREEZE_KEYS if _ in module_name]) > 0: - value.requires_grad = False - continue + value.requires_grad = False + continue memo.add(value) hyperparams = copy.copy(defaults) @@ -114,7 +114,7 @@ def build_optimizer(cls, cfg, model): if isinstance(module, torch.nn.Embedding): hyperparams["weight_decay"] = weight_decay_embed params.append({"params": [value], **hyperparams}) - + def maybe_add_full_model_gradient_clipping(optim): # detectron2 doesn't have full model gradient clipping now clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE @@ -146,3 +146,4 @@ def step(self, closure=None): if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": optimizer = maybe_add_gradient_clipping(cfg, optimizer) return optimizer + diff --git a/configs/DeticLabeling.yaml b/configs/DeticLabeling.yaml new file mode 100644 index 0000000..85bae61 --- /dev/null +++ b/configs/DeticLabeling.yaml @@ -0,0 +1,24 @@ +_BASE_: "detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml" +MODEL: + META_ARCHITECTURE: "LabelingDetic" + WEIGHTS: "weights/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" + ROI_BOX_HEAD: + ZEROSHOT_WEIGHT_PATH: 'Detic/datasets/metadata/oid_clip_a+cname.npy' + ROI_HEADS: + NUM_CLASSES: 500 # not used + SCORE_THRESH_TEST: 0.0 # no threshold. +PROPOSAL_GENERATION: + BATCH_SIZE: 1 + DETIC_LABELING_MODE: "max-gt-label" + ROOT_FOLDER_NAME: "pseudo_labels" # name of the folder to save + SAVE_SCORE_THRESHOLD: 0.0 + DEBUG: False # set it to True for quick debug. 
+ PARTITION_INDEX: 0 # used when multi processing + TOTAL_PARTITIONS: 0 # set to greater than 0 when multi processing +INPUT: + IMAGE_SIZE: 640 +DATASETS: + TEST: ('imagenet_1k_train',) +TEST: + DETECTIONS_PER_IMAGE: 1000 +OUTPUT_DIR: "output/detic/" \ No newline at end of file diff --git a/configs/PartDistillation.yaml b/configs/PartDistillation.yaml new file mode 100644 index 0000000..9619bc5 --- /dev/null +++ b/configs/PartDistillation.yaml @@ -0,0 +1,57 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("imagenet_1k_train",) + TEST: ("part_imagenet_match_val","part_imagenet_evaluate_val",) +MODEL: + SWIN: + USE_CHECKPOINT: True + WEIGHTS: "weights/PartProposalLearning/IN1K+Human/part_proposal_model.pth" + META_ARCHITECTURE: "PartDistillationModel" + MASK_FORMER: + TRANSFORMER_DECODER_NAME: "PartDistillationTransformerDecoder" + FREEZE_KEYS: ["backbone","encoder"] + IMPORTANCE_SAMPLE_RATIO: 3.0 + OVERSAMPLE_RATIO: 0.0 + QUERY_FEATURE_NORMALIZE: True # normalize part-level feature (query feature) +PART_DISTILLATION: + MIN_OBJECT_AREA_RATIO: 0.001 + MIN_AREA_RATIO: 0.001 + MIN_SCORE: 0.0 + DATASET_PATH_LIST: ("pseudo_labels/part_labels/part_masks_with_class/imagenet_1k_train/lr_0.00001_scale_0.1_2.0_sampling_ratio_0.0/dot_8/r1_0.0_s1_0.7_r2_0.0_s2_0.0/",) + DATASET_PATH: "pseudo_labels/part_labels/part_masks_with_class/imagenet_1k_train/lr_0.00001_scale_0.1_2.0_sampling_ratio_0.0/dot_8/r1_0.0_s1_0.7_r2_0.0_s2_0.0/" + USE_PER_PIXEL_LABEL: True + SET_IMAGE_SQUARE: True + APPLY_MASKING_WITH_OBJECT_MASK: True + PATH_ONLY: True + NUM_OBJECT_CLASSES: 1000 # imagenet-1k + NUM_PART_CLASSES: 8 # per-object-class num classes + DEBUG: False +CUSTOM_DATASETS: + USE_MERGED_GT: True + AUG_NAME_LIST: ["crop","scale","flip"] + BASE_SIZE: 640 +INPUT: + MIN_SIZE_TRAIN: (640,) + MIN_SIZE_TEST: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +TEST: + EVAL_PERIOD: 10000 + 
DETECTIONS_PER_IMAGE: 200 +SOLVER: + BASE_LR: 0.00001 + IMS_PER_BATCH: 32 + MAX_ITER: 90000 + STEPS: (75000, 85000) + AMP: + ENABLED: True +WANDB: + PROJECT: "PartDistillationTrain" + VIS_PERIOD_TRAIN: 2000 + VIS_PERIOD_TEST: 100 + DISABLE_WANDB: True # set it to False for W&B visualization. +FP16: True +USE_CHECKPOINT: True +OUTPUT_DIR: "output/PartDistillation/IN1K+Human/" +VIS_OUTPUT_DIR: "vis_logs/PartDistillation/IN1K+Human/" \ No newline at end of file diff --git a/configs/PartProposalLearning.yaml b/configs/PartProposalLearning.yaml new file mode 100644 index 0000000..9812cf4 --- /dev/null +++ b/configs/PartProposalLearning.yaml @@ -0,0 +1,55 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("imagenet_1k_train",) + TEST: ("pascal_part_val","pascal_part_val","part_imagenet_valtest","part_imagenet_valtest",) +MODEL: + SWIN: + USE_CHECKPOINT: False + WEIGHTS: "weights/mask2former/instance/swinL_i21k_q200_e100.pkl" + META_ARCHITECTURE: "ProposalModel" + SEM_SEG_HEAD: + NUM_CLASSES: 1 + MASK_FORMER: + FREEZE_KEYS: ["backbone","encoder"] + IMPORTANCE_SAMPLE_RATIO: 3.0 + OVERSAMPLE_RATIO: 0.0 +PROPOSAL_LEARNING: + MIN_OBJECT_AREA_RATIO: 0.01 + MIN_AREA_RATIO: 0.05 + MIN_SCORE: 0.0 + DATASET_PATH_LIST: ("pseudo_labels/part_labels/processed_proposals/max-gt-label/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/", "pseudo_labels/part_labels/processed_proposals/human-only/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/",) + DATASET_PATH: "pseudo_labels/part_labels/processed_proposals/max-gt-label/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/" + USE_PER_PIXEL_LABEL: False + APPLY_MASKING_WITH_OBJECT_MASK: True + POSTPROCESS_TYPES: ("prop","semseg","prop","semseg",) + PATH_ONLY: True + DEBUG: False # set to True for quick debug. 
+INPUT: + MIN_SIZE_TRAIN: (640,) + MAX_SIZE_TRAIN: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +CUSTOM_DATASETS: + USE_MERGED_GT: True + AUG_NAME_LIST: ["crop","scale","flip"] + BASE_SIZE: 640 +TEST: + EVAL_PERIOD: 10000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + BASE_LR: 0.00001 + IMS_PER_BATCH: 64 + MAX_ITER: 90000 + STEPS: (75000, 85000) + AMP: + ENABLED: True +WANDB: + PROJECT: "proposal_learning" + VIS_PERIOD_TRAIN: 2000 + VIS_PERIOD_TEST: 100 + DISABLE_WANDB: True # set it to False for W&B visualization. +FP16: True +USE_CHECKPOINT: True +OUTPUT_DIR: "output/PartProposalLearning/IN1K+Human/" +VIS_OUTPUT_DIR: "vis_logs/PartProposalLearning/IN1K+Human/" \ No newline at end of file diff --git a/configs/PartRanking.yaml b/configs/PartRanking.yaml new file mode 100644 index 0000000..8cf3322 --- /dev/null +++ b/configs/PartRanking.yaml @@ -0,0 +1,55 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("imagenet_1k_train",) + TEST: ("imagenet_1k_pre_labeling_train","imagenet_1k_post_labeling_train",) +MODEL: + SWIN: + USE_CHECKPOINT: False + WEIGHTS: "weights/PartProposalLearning/IN1K+Human/part_proposal_model.pth" + META_ARCHITECTURE: "PartRankingModel" + SEM_SEG_HEAD: + NUM_CLASSES: 1 +PART_RANKING: + ROOT_FOLDER_NAME: "pseudo_labels" + WEIGHT_NAME: "proposal_model" + MIN_AREA_RATIO_1: 0.0 + MIN_SCORE_1: 0.7 + MIN_AREA_RATIO_2: 0.0 + MIN_SCORE_2: 0.0 + DATASET_PATH: "pseudo_labels/part_labels/processed_proposals/max-gt-label/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/" + DATASET_PATH_LIST: ("pseudo_labels/part_labels/processed_proposals/max-gt-label/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/",) + USE_PER_PIXEL_LABEL_DURING_CLUSTERING: True + USE_PER_PIXEL_LABEL_DURING_LABELING: True + PROPOSAL_KEY: "decoder_output" + CLASSIFIER_METRIC: "dot" + NUM_CLUSTERS: 8 + APPLY_MASKING_WITH_OBJECT_MASK: True + PROPOSAL_FEATURE_NORM: False + PARTITION_INDEX: 0 + 
TOTAL_PARTITIONS: 0 + SAVE_ANNOTATIONS: True + DEBUG: False +CUSTOM_DATASETS: + USE_MERGED_GT: True +INPUT: + MIN_SIZE_TRAIN: (640,) + MAX_SIZE_TRAIN: 640 + MIN_SIZE_TEST: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +TEST: + EVAL_PERIOD: 5000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + IMS_PER_BATCH: 32 +WANDB: + PROJECT: "part_ranking" + VIS_PERIOD_TRAIN: 200 + VIS_PERIOD_TEST: 200 + DISABLE_WANDB: True # set it to False for W&B visualization. +# NOTE: duplicate OUTPUT_DIR key removed; the value is set once below. +FP16: True +USE_CHECKPOINT: True +OUTPUT_DIR: "output/PartRanking/IN1K+Human/" +VIS_OUTPUT_DIR: "vis_logs/PartRanking/IN1K+Human/" \ No newline at end of file diff --git a/configs/PixelGrouping.yaml b/configs/PixelGrouping.yaml new file mode 100644 index 0000000..d5f858a --- /dev/null +++ b/configs/PixelGrouping.yaml @@ -0,0 +1,31 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("imagenet_1k_train",) + TEST: ("part_imagenet_valtest",) +MODEL: + SWIN: + USE_CHECKPOINT: False + WEIGHTS: "weights/mask2former/instance/swinL_i21k_q200_e100.pkl" + META_ARCHITECTURE: "PixelGroupingModel" +PIXEL_GROUPING: + FEATURE_NORMALIZE: False + NUM_SUPERPIXEL_CLUSTERS: 4 + DISTANCE_METRIC: "dot" + BACKBONE_FEATURE_KEY_LIST: ["res3","res4"] +CUSTOM_DATASETS: + USE_MERGED_GT: True +INPUT: + MIN_SIZE_TRAIN: (640,) + MIN_SIZE_TEST: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +TEST: + EVAL_PERIOD: 5000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + IMS_PER_BATCH: 32 +WANDB: + PROJECT: "PixelGrouping" + DISABLE_WANDB: True + VIS_PERIOD_TEST: 30 \ No newline at end of file diff --git a/configs/ProposalGeneration.yaml b/configs/ProposalGeneration.yaml new file mode 100644 index 0000000..844a0ea --- /dev/null +++ b/configs/ProposalGeneration.yaml @@ -0,0 +1,39 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: 
("imagenet_1k_train",) + TEST: ("imagenet_1k_train",) +MODEL: + SWIN: + USE_CHECKPOINT: False + WEIGHTS: "weights/mask2former/instance/swinL_i21k_q200_e100.pkl" + META_ARCHITECTURE: "ProposalGenerationModel" +PROPOSAL_GENERATION: + BATCH_SIZE: 1 + DETIC_LABELING_MODE: "max-gt-label" + OBJECT_MASK_PATH: "pseudo_labels/object_labels/detic_predictions/max-gt-label/imagenet_1k_train/" + NUM_SUPERPIXEL_CLUSTERS: 4 + DATASET_NAME: "imagenet_1k_train" + OBJECT_MASK_TYPE: "detic" + WITH_GIVEN_MASK: True + DISTANCE_METRIC: "dot" + BACKBONE_FEATURE_KEY_LIST: '["res3","res4"]' + FEATURE_NORMALIZE: False + ROOT_FOLDER_NAME: "pseudo_labels" # name of the folder to save + SAVE_SCORE_THRESHOLD: 0.0 + DEBUG: False # set it to True for quick debug. + PARTITION_INDEX: 0 # used when multi processing + TOTAL_PARTITIONS: 0 # set to greater than 0 when multi processing +INPUT: + MIN_SIZE_TRAIN: (640,) + MAX_SIZE_TRAIN: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" +TEST: + EVAL_PERIOD: 1000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + IMS_PER_BATCH: 32 +WANDB: + PROJECT: "proposal_generation" + VIS_PERIOD_TEST: 2000 + DISABLE_WANDB: True # set it to False for W&B visualization. 
\ No newline at end of file diff --git a/configs/SupervisedLearning.yaml b/configs/SupervisedLearning.yaml new file mode 100644 index 0000000..e3cee39 --- /dev/null +++ b/configs/SupervisedLearning.yaml @@ -0,0 +1,40 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("part_imagenet_train",) + TEST: ("pascal_part_valtest",) +MODEL: + SWIN: + USE_CHECKPOINT: True + WEIGHTS: "weights/mask2former/instance/swinL_i21k_q200_e100.pkl" + META_ARCHITECTURE: "SupervisedModel" + SEM_SEG_HEAD: + NUM_CLASSES: 50 +SUPERVISED_MODEL: + USE_PER_PIXEL_LABEL: True + APPLY_MASKING_WITH_OBJECT_MASK: True + CLASS_AGNOSTIC_LEARNING: False + CLASS_AGNOSTIC_INFERENCE: False +INPUT: + MIN_SIZE_TRAIN: (640,) + MAX_SIZE_TRAIN: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +CUSTOM_DATASETS: + USE_MERGED_GT: True + AUG_NAME_LIST: ["crop","scale","flip"] + BASE_SIZE: 640 +TEST: + EVAL_PERIOD: 10000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + IMS_PER_BATCH: 32 + AMP: + ENABLED: True +FP16: True +USE_CHECKPOINT: True +WANDB: + PROJECT: "supervised_learning" + VIS_PERIOD_TRAIN: 2000 + VIS_PERIOD_TEST: 100 + DISABLE_WANDB: True # set it to False for W&B visualization. 
\ No newline at end of file diff --git a/configs/SupervisedPartProposalLearning.yaml b/configs/SupervisedPartProposalLearning.yaml new file mode 100644 index 0000000..ffb77fd --- /dev/null +++ b/configs/SupervisedPartProposalLearning.yaml @@ -0,0 +1,39 @@ +_BASE_: mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +DATASETS: + TRAIN: ("part_imagenet_train",) + TEST: ("pascal_part_valtest",) +MODEL: + SWIN: + USE_CHECKPOINT: True + WEIGHTS: "weights/mask2former/instance/swinL_i21k_q200_e100.pkl" + META_ARCHITECTURE: "SupervisedModel" + SEM_SEG_HEAD: + NUM_CLASSES: 1 +SUPERVISED_MODEL: + USE_PER_PIXEL_LABEL: False + APPLY_MASKING_WITH_OBJECT_MASK: True + CLASS_AGNOSTIC_LEARNING: True +INPUT: + MIN_SIZE_TRAIN: (640,) + MAX_SIZE_TRAIN: 640 + IMAGE_SIZE: 640 + MASK_FORMAT: "bitmask" + SIZE_DIVISIBILITY: 16 +CUSTOM_DATASETS: + USE_MERGED_GT: True + AUG_NAME_LIST: ["crop","scale","flip"] + BASE_SIZE: 640 +TEST: + EVAL_PERIOD: 10000 + DETECTIONS_PER_IMAGE: 200 +SOLVER: + IMS_PER_BATCH: 32 + AMP: + ENABLED: True +FP16: True +USE_CHECKPOINT: True +WANDB: + PROJECT: "supervised_part_proposal_learning" + VIS_PERIOD_TRAIN: 2000 + VIS_PERIOD_TEST: 100 + DISABLE_WANDB: True # set it to False for W&B visualization. 
\ No newline at end of file diff --git a/configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml b/configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml index 44349f2..2fd3c42 100644 --- a/configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml +++ b/configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml @@ -1,6 +1,6 @@ _BASE_: "Base-C2_L_R5021k_640b64_4x.yaml" MODEL: - WEIGHTS: "models/BoxSup-C2_LCOCO_CLIP_SwinB_896b32_4x.pth" + WEIGHTS: "weights/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" DYNAMIC_CLASSIFIER: True ROI_BOX_HEAD: USE_ZEROSHOT_CLS: True @@ -40,4 +40,6 @@ DATALOADER: DATASET_ANN: ['box', 'image'] NUM_WORKERS: 4 USE_TAR_DATASET: True +INPUT: + MIN_SIZE_TEST: 640 WITH_IMAGE_LABELS: True \ No newline at end of file diff --git a/configs/detic/Detic_Labeling.yaml b/configs/detic/Detic_Labeling.yaml deleted file mode 100644 index 3794adf..0000000 --- a/configs/detic/Detic_Labeling.yaml +++ /dev/null @@ -1,10 +0,0 @@ -_BASE_: "Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml" -MODEL: - META_ARCHITECTURE: "LabelingDetic" - WEIGHTS: "weights/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" - ROI_BOX_HEAD: - ZEROSHOT_WEIGHT_PATH: 'Detic/datasets/metadata/oid_clip_a+cname.npy' - ROI_HEADS: - NUM_CLASSES: 500 -DATASETS: - TEST: ('imagenet_1k_train',) \ No newline at end of file diff --git a/configs/part_distillation/swinL_IN21K_384_mask2former.yaml b/configs/part_distillation/swinL_IN21K_384_mask2former.yaml deleted file mode 100644 index 6eca618..0000000 --- a/configs/part_distillation/swinL_IN21K_384_mask2former.yaml +++ /dev/null @@ -1,24 +0,0 @@ -_BASE_: ../mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml -DATASETS: - TRAIN: ("imagenet_1k_train",) - TEST: ("part_imagenet_match_val","part_imagenet_evaluate_val",) -MODEL: - SWIN: - USE_CHECKPOINT: False - WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" - 
META_ARCHITECTURE: "PartDistillationModel" - MASK_FORMER: - TRANSFORMER_DECODER_NAME: "PartDistillationTransformerDecoder" -INPUT: - MIN_SIZE_TRAIN: (640,) - MIN_SIZE_TEST: 640 - IMAGE_SIZE: 640 - MASK_FORMAT: "bitmask" - SIZE_DIVISIBILITY: 16 -TEST: - EVAL_PERIOD: 5000 - DETECTIONS_PER_IMAGE: 200 -SOLVER: - IMS_PER_BATCH: 32 -WANDB: - PROJECT: "PartDistillationTrain" \ No newline at end of file diff --git a/configs/part_ranking/swinL_IN21K_384_mask2former.yaml b/configs/part_ranking/swinL_IN21K_384_mask2former.yaml deleted file mode 100644 index 5624af8..0000000 --- a/configs/part_ranking/swinL_IN21K_384_mask2former.yaml +++ /dev/null @@ -1,25 +0,0 @@ -_BASE_: ../mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml -DATASETS: - TRAIN: ("imagenet_1k_train",) - TEST: ("imagenet_1k_train",) -MODEL: - SWIN: - USE_CHECKPOINT: False - WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" - META_ARCHITECTURE: "PartRankingModel" - SEM_SEG_HEAD: - NUM_CLASSES: 1 -INPUT: - MIN_SIZE_TRAIN: (640,) - MAX_SIZE_TRAIN: 640 - MIN_SIZE_TEST: 640 - IMAGE_SIZE: 640 - MASK_FORMAT: "bitmask" - SIZE_DIVISIBILITY: 16 -TEST: - EVAL_PERIOD: 5000 - DETECTIONS_PER_IMAGE: 200 -SOLVER: - IMS_PER_BATCH: 32 -WANDB: - PROJECT: "part_ranking" \ No newline at end of file diff --git a/configs/pixel_grouping/maskformer2_R101_bs16_50ep.yaml b/configs/pixel_grouping/ClusteringBaseline.yaml similarity index 100% rename from configs/pixel_grouping/maskformer2_R101_bs16_50ep.yaml rename to configs/pixel_grouping/ClusteringBaseline.yaml diff --git a/configs/pixel_grouping/swinL_IN21K_384_mask2former.yaml b/configs/pixel_grouping/PixelGroupingModel.yaml similarity index 78% rename from configs/pixel_grouping/swinL_IN21K_384_mask2former.yaml rename to configs/pixel_grouping/PixelGroupingModel.yaml index fa1f595..09f799e 100644 --- a/configs/pixel_grouping/swinL_IN21K_384_mask2former.yaml +++ b/configs/pixel_grouping/PixelGroupingModel.yaml @@ -7,6 +7,12 @@ MODEL: 
USE_CHECKPOINT: False WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" META_ARCHITECTURE: "PixelGroupingModel" +PIXEL_GROUPING: + NUM_SUPERPIXEL_CLUSTERS: 4 + DISTANCE_METRIC: "dot" + BACKBONE_FEATURE_KEY_LIST: ["res3","res4"] +CUSTOM_DATASETS: + USE_MERGED_GT: True INPUT: MIN_SIZE_TRAIN: (640,) MIN_SIZE_TEST: 640 diff --git a/configs/proposal_generation/swinL_IN21K_384_mask2former.yaml b/configs/proposal_generation/swinL_IN21K_384_mask2former.yaml deleted file mode 100644 index 01195c3..0000000 --- a/configs/proposal_generation/swinL_IN21K_384_mask2former.yaml +++ /dev/null @@ -1,23 +0,0 @@ -_BASE_: ../mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml -DATASETS: - TRAIN: ("imagenet_22k_train",) - TEST: ("imagenet_22k_train",) -MODEL: - SWIN: - USE_CHECKPOINT: False - WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" - META_ARCHITECTURE: "ProposalGenerationModel" -INPUT: - MIN_SIZE_TRAIN: (640,) - MAX_SIZE_TRAIN: 640 - IMAGE_SIZE: 640 - MASK_FORMAT: "bitmask" -TEST: - EVAL_PERIOD: 1000 - DETECTIONS_PER_IMAGE: 200 -SOLVER: - IMS_PER_BATCH: 32 -PROPOSAL_GENERATION: - DATASET_NAME: "imagenet_1k_train" -WANDB: - PROJECT: "proposal_generation" \ No newline at end of file diff --git a/configs/proposal_learning/swinL_IN21K_384_mask2former.yaml b/configs/proposal_learning/swinL_IN21K_384_mask2former.yaml deleted file mode 100644 index f42d683..0000000 --- a/configs/proposal_learning/swinL_IN21K_384_mask2former.yaml +++ /dev/null @@ -1,24 +0,0 @@ -_BASE_: ../mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml -DATASETS: - TRAIN: ("imagenet_22k_train",) - TEST: ("pascal_part_valtest",) -MODEL: - SWIN: - USE_CHECKPOINT: False - WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" - META_ARCHITECTURE: "ProposalModel" - SEM_SEG_HEAD: - NUM_CLASSES: 1 -INPUT: - MIN_SIZE_TRAIN: (640,) - MAX_SIZE_TRAIN: 640 - IMAGE_SIZE: 640 - MASK_FORMAT: "bitmask" - SIZE_DIVISIBILITY: 16 -TEST: 
- EVAL_PERIOD: 5000 - DETECTIONS_PER_IMAGE: 200 -SOLVER: - IMS_PER_BATCH: 32 -WANDB: - PROJECT: "proposal_learning" \ No newline at end of file diff --git a/configs/supervised_learning/swinL_IN21K_384_mask2former.yaml b/configs/supervised_learning/swinL_IN21K_384_mask2former.yaml deleted file mode 100644 index 583a509..0000000 --- a/configs/supervised_learning/swinL_IN21K_384_mask2former.yaml +++ /dev/null @@ -1,24 +0,0 @@ -_BASE_: ../mask2former/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml -DATASETS: - TRAIN: ("pascal_part_train",) - TEST: ("pascal_part_valtest",) -MODEL: - SWIN: - USE_CHECKPOINT: False - WEIGHTS: "weights/m2f/instance/swinL_i21k_q200_e100.pkl" - META_ARCHITECTURE: "SupervisedModel" - SEM_SEG_HEAD: - NUM_CLASSES: 50 -INPUT: - MIN_SIZE_TRAIN: (640,) - MAX_SIZE_TRAIN: 640 - IMAGE_SIZE: 640 - MASK_FORMAT: "bitmask" - SIZE_DIVISIBILITY: 16 -TEST: - EVAL_PERIOD: 5000 - DETECTIONS_PER_IMAGE: 200 -SOLVER: - IMS_PER_BATCH: 32 -WANDB: - PROJECT: "supervised_learning" \ No newline at end of file diff --git a/continuously_postprocess_dcrf.py b/continuously_postprocess_dcrf.py index d1412d8..0863261 100644 --- a/continuously_postprocess_dcrf.py +++ b/continuously_postprocess_dcrf.py @@ -3,15 +3,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os + +import os import copy -import argparse -import torch -import numpy as np -import matplotlib.pyplot as plt +import argparse +import torch +import numpy as np +import matplotlib.pyplot as plt import pydensecrf.densecrf as dcrf import pydensecrf.utils as dcrf_utils -import time +import time from pycocotools import mask as coco_mask from detectron2.structures import BoxMode @@ -20,28 +21,39 @@ from detectron2.structures import BitMasks, Instances -def dense_crf(image, label, n_labels, p=0.7, t=10, sd1=3, sd2=20, sc=13, compat1=3, compat2=10): +def dense_crf( + image, + label, + n_labels, + p=0.7, + t=10, + sd1=3, + sd2=20, + sc=13, + compat1=3, + compat2=10 +): annotated_label = label.to(torch.int32).numpy() colors, labels = np.unique(annotated_label, return_inverse=True) c = image.shape[2] h = image.shape[0] w = image.shape[1] - + d = dcrf.DenseCRF2D(w, h, n_labels) U = dcrf_utils.unary_from_labels(labels, n_labels, gt_prob=p, zero_unsure=False) d.setUnaryEnergy(U) - + # This adds the color-independent term, features are the locations only. feats = dcrf_utils.create_pairwise_gaussian(sdims=(sd1, sd1), shape=(h, w)) d.addPairwiseEnergy(feats, compat=compat1, kernel=dcrf.DIAG_KERNEL, normalization=dcrf.NORMALIZE_SYMMETRIC) # This adds the color-dependent term, i.e. features are (x,y,r,g,b). - feats = dcrf_utils.create_pairwise_bilateral(sdims=(sd2, sd2), schan=(sc, sc, sc), + feats = dcrf_utils.create_pairwise_bilateral(sdims=(sd2, sd2), schan=(sc, sc, sc), img=image, chdim=2) - d.addPairwiseEnergy(feats, compat=compat2, + d.addPairwiseEnergy(feats, compat=compat2, kernel=dcrf.DIAG_KERNEL, normalization=dcrf.NORMALIZE_SYMMETRIC) @@ -68,7 +80,7 @@ def proposals_to_coco_json(binary_mask): # the pycocotools/_mask.pyx does). 
rle["counts"] = rle["counts"].decode("utf-8") - return [{"segmentation": rle} for rle in rles] + return [{"segmentation": rle} for rle in rles] def get_argparse(): @@ -81,83 +93,109 @@ def get_argparse(): parser.add_argument('--res', type=str, default="res3_res4") parser.add_argument('--num_k', type=int, default=4) parser.add_argument('--feat_norm', action="store_true", default=False) + parser.add_argument('--check_broken_files', action="store_true", default=False) + parser.add_argument('--root_folder_name', type=str, default='pseudo_labels') + parser.add_argument('--label_mode', type=str, default='max-gt-label') parser.add_argument('--debug', action="store_true") return parser.parse_args() -path_root = "pseudo_labels/proposal_generation/" -# dcrf is done on larger resolution for performance reason. + +# dcrf is done on larger resolution for performance reason. +# the predictions already resized to 640. augs = [T.ResizeScale(min_scale=1.0, max_scale=1.0, target_height=640, target_width=640), - T.FixedSizeCrop(crop_size=(640, 640)), + # T.FixedSizeCrop(crop_size=(640, 640)), ] if __name__ == "__main__": args = get_argparse() - source_root = os.path.join(path_root, args.dataset_name, "detic_based", "generated_proposals", args.res, "{}_{}".format(args.dist_metric, args.num_k)) - target_root = os.path.join(path_root, args.dataset_name, "detic_based", "generated_proposals_processed", args.res, "{}_{}".format(args.dist_metric, args.num_k)) - + source_root = f"{args.root_folder_name}/part_labels/proposal_generation/{args.label_mode}/" + target_root = f"{args.root_folder_name}/part_labels/processed_proposals/{args.label_mode}/" + source_root = os.path.join(source_root, args.dataset_name, "detic", args.res, \ + "{}_{}_norm_{}".format(args.dist_metric, args.num_k, args.feat_norm)) + target_root = os.path.join(target_root, args.dataset_name, "detic", args.res, \ + "{}_{}_norm_{}".format(args.dist_metric, args.num_k, args.feat_norm)) + + # partition the list of imagnet classes 
for each process. code_list = os.listdir(source_root) if args.num_parallel_jobs > 0: num_total_classes = len(code_list) num_classes_per_job = num_total_classes // args.num_parallel_jobs - num_remaining_classes = num_total_classes - args.num_parallel_jobs * num_classes_per_job - num_current_job_classes = num_classes_per_job + num_remaining_classes = num_total_classes - args.num_parallel_jobs * num_classes_per_job + num_current_job_classes = num_classes_per_job - start_i = num_current_job_classes * (args.parallel_job_id-1) - end_i = num_current_job_classes * args.parallel_job_id - if args.parallel_job_id == args.num_parallel_jobs: + start_i = num_current_job_classes * args.parallel_job_id + end_i = num_current_job_classes * (args.parallel_job_id+1) + if args.parallel_job_id+1 == args.num_parallel_jobs: end_i = num_total_classes code_list = code_list[start_i:end_i] + # make folders for code in code_list: if not os.path.exists(os.path.join(target_root, code)): os.makedirs(os.path.join(target_root, code)) - - num_total = 0 + + # count total number of files to make + num_total = 0 for code in code_list: num_total += len(os.listdir(os.path.join(source_root, code))) - t0 = time.time() + t0 = time.time() + while True: count = 0 for code in code_list: fname_list = os.listdir(os.path.join(source_root, code)) for fname in fname_list: if not os.path.exists(os.path.join(target_root, code, fname)): - data = torch.load(os.path.join(source_root, code, fname), "cpu") - mask = data["part_masks"] - # mask = data["part_mask"] + try: + data = torch.load(os.path.join(source_root, code, fname), "cpu") + except: + print("broken file:", os.path.join(source_root, code, fname)) + continue + mask = data["part_mask"] if mask is not None: - # image = utils.read_image(data["file_path"], format="RGB") - image = utils.read_image(data["file_name"], format="RGB") - # Resizing + image = utils.read_image(data["file_path"], format="RGB") + + # Resizing aug_input = T.AugInput(image) aug_input, transforms 
= T.apply_transform_gens(augs, aug_input) image = aug_input.image - bmask = [] + bmask = [] for segm in mask: bmask.append(coco_mask.decode(segm["segmentation"])) bmask = torch.tensor(np.array(bmask)) assert image.shape[:2] == bmask.shape[1:], "tensor shapes do not match. ({} != {})"\ .format(image.shape[:2], bmask.shape[1:]) - num_c = bmask.shape[0] + num_c = bmask.shape[0] cmask = (bmask * (torch.arange(num_c) + 1)[:, None, None]).sum(0) cmask = torch.tensor(dense_crf(image, cmask, num_c + 1)) - o_cls = cmask.unique() + o_cls = cmask.unique() o_cls = o_cls[o_cls != 0] - bmask = torch.zeros(len(o_cls), *cmask.shape).bool() + bmask = torch.zeros(len(o_cls), *cmask.shape).bool() for i, c in enumerate(o_cls): bmask[i] = cmask == c - data["part_masks"] = proposals_to_coco_json(bmask) + del data['part_mask'] + data["part_mask"] = proposals_to_coco_json(bmask) if args.debug: assert False, "debug. " - + torch.save(data, os.path.join(target_root, code, fname)) - if count % 1000 == 1: + if count % 100 == 1: print("{} ({:.2f} %) images processed on process {} ({:.2f} / image)"\ .format(count, count/num_total*100, args.parallel_job_id, (time.time()-t0)/count), flush=True) - count += 1 + count += 1 + if count == num_total: + print("process {} done. 
(processed {}/{} images)".format(args.parallel_job_id, count, num_total), flush=True) + break + + + + + + + diff --git a/datasets/cityscapes_part b/datasets/cityscapes_part deleted file mode 120000 index 31b7c03..0000000 --- a/datasets/cityscapes_part +++ /dev/null @@ -1 +0,0 @@ -/checkpoint/janghyuncho7/PartDistillation/datasets/cityscapes/ \ No newline at end of file diff --git a/datasets/imagenet_1k b/datasets/imagenet_1k deleted file mode 120000 index 3ee405b..0000000 --- a/datasets/imagenet_1k +++ /dev/null @@ -1 +0,0 @@ -/datasets01/imagenet_full_size/061417/ \ No newline at end of file diff --git a/datasets/imagenet_22k b/datasets/imagenet_22k deleted file mode 120000 index 2e7a735..0000000 --- a/datasets/imagenet_22k +++ /dev/null @@ -1 +0,0 @@ -/datasets01/imagenet-22k/062717/ \ No newline at end of file diff --git a/datasets/metadata/imagenet_1k_fname_classname_dict.pkl b/datasets/metadata/imagenet_1k_fname_classname_dict.pkl new file mode 100644 index 0000000..7b718dd Binary files /dev/null and b/datasets/metadata/imagenet_1k_fname_classname_dict.pkl differ diff --git a/datasets/metadata/imagenet_1k_simple_labels.pkl b/datasets/metadata/imagenet_1k_simple_labels.pkl new file mode 100644 index 0000000..15864cf Binary files /dev/null and b/datasets/metadata/imagenet_1k_simple_labels.pkl differ diff --git a/datasets/part_imagenet b/datasets/part_imagenet deleted file mode 120000 index cf6a132..0000000 --- a/datasets/part_imagenet +++ /dev/null @@ -1 +0,0 @@ -/checkpoint/janghyuncho7/PartDistillation/datasets/imagenet_parts/ \ No newline at end of file diff --git a/datasets/pascal_parts/annotations b/datasets/pascal_parts/annotations deleted file mode 120000 index d697f74..0000000 --- a/datasets/pascal_parts/annotations +++ /dev/null @@ -1 +0,0 @@ -/datasets01/PASCAL_MT/101218/PascalParts/Annotations_Part/ \ No newline at end of file diff --git a/datasets/pascal_parts/images b/datasets/pascal_parts/images deleted file mode 120000 index d7ebb70..0000000 --- 
a/datasets/pascal_parts/images +++ /dev/null @@ -1 +0,0 @@ -/datasets01/VOC/060817/VOCdevkit/VOC2012/ \ No newline at end of file diff --git a/detic_labeling_net.py b/detic_labeling_net.py index 2392b2a..0edea22 100644 --- a/detic_labeling_net.py +++ b/detic_labeling_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -21,7 +22,7 @@ from collections import OrderedDict from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, +from detectron2.data import (MetadataCatalog, build_detection_test_loader) from detectron2.engine import default_argument_parser, default_setup, launch @@ -57,39 +58,45 @@ def get_clip_embeddings(vocabulary, prompt='a '): -def prepare_model(model, dataset_name, debug): +def prepare_model(model, dataset_name, labeling_mode, score_thres, debug): logger.info("preparing model for {}.".format(dataset_name)) metadata = MetadataCatalog.get(dataset_name) - maybe_dp(model).register_metadata(metadata, debug) + maybe_dp(model).register_metadata(metadata, labeling_mode, score_thres, debug) # Setup clip classifier with class names. 
- classifier = get_clip_embeddings(metadata.classes) - num_classes = len(metadata.classes) + if labeling_mode == 'human-only': + metadata.class_names = ["person", "man", "woman", "toddler", "human"] + else: + metadata.class_names = metadata.classes + classifier = get_clip_embeddings(metadata.class_names) + num_classes = len(metadata.class_names) reset_cls_test(model, classifier, num_classes) return model - def do_label(cfg, model): results = OrderedDict() for d, dataset_name in enumerate(cfg.DATASETS.TEST): - model = prepare_model(model, dataset_name, cfg.PROPOSAL_GENERATION.DEBUG) + label_mode = cfg.PROPOSAL_GENERATION.DETIC_LABELING_MODE + score_thres = cfg.PROPOSAL_GENERATION.SAVE_SCORE_THRESHOLD + model = prepare_model(model, dataset_name, label_mode, score_thres, + cfg.PROPOSAL_GENERATION.DEBUG) mapper = ProposalGenerationMapper(cfg) - data_loader = build_detection_test_loader(cfg, dataset_name, - batch_size=cfg.PROPOSAL_GENERATION.BATCH_SIZE, + data_loader = build_detection_test_loader(cfg, dataset_name, + batch_size=cfg.PROPOSAL_GENERATION.BATCH_SIZE, mapper=mapper) evaluator = NullEvaluator() results[dataset_name] = inference_on_dataset(model, data_loader, evaluator) - + if comm.is_main_process(): logger.info("Evaluation results for {} in csv format:".format( dataset_name)) print_csv_format(results[dataset_name]) if len(results) == 1: results = list(results.values())[0] - + return results @@ -110,10 +117,12 @@ def setup(args): logger.info('OUTPUT_DIR: {}'.format(cfg.OUTPUT_DIR)) cfg.freeze() default_setup(cfg, args) - setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="detic") + setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="part_distillation") dataset_name_dir = cfg.PROPOSAL_GENERATION.DATASET_NAME if not cfg.PROPOSAL_GENERATION.DEBUG else "debug" - save_path = f"pseudo_labels/object_labels/detic_predictions/{dataset_name_dir}/" + detic_labeling_mode = cfg.PROPOSAL_GENERATION.DETIC_LABELING_MODE + 
root_folder_name = cfg.PROPOSAL_GENERATION.ROOT_FOLDER_NAME + save_path = f"{root_folder_name}/object_labels/detic_predictions/{detic_labeling_mode}/{dataset_name_dir}/" # register dataset register_imagenet( cfg.PROPOSAL_GENERATION.DATASET_NAME, @@ -135,7 +144,7 @@ def setup(args): if comm.is_main_process(): if not os.path.exists(save_path): os.makedirs(save_path) - + for fname in metadata.class_codes: folder_path = os.path.join(save_path, fname) if not os.path.exists(folder_path): diff --git a/docs/ANALYSIS.md b/docs/ANALYSIS.md new file mode 100644 index 0000000..4c4cc80 --- /dev/null +++ b/docs/ANALYSIS.md @@ -0,0 +1,37 @@ +# Analyzing Pre-trained Features for Part Segmentation + + +### Visualizing Part Segments by Pixel Grouping +We first explore different pre-trained features and their capability of part segmentation. We provide a demo to try out: + +``` +python part_segment_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/cola.jpg --vocabulary custom --confidence-threshold 0.1 --custom_vocabulary cola --min-image-size 640 --k 4 --weight-name coco_instance_seg --dcrf + +python part_segment_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/cat.jpg --vocabulary custom --confidence-threshold 0.1 --custom_vocabulary cat --min-image-size 640 --k 4 --weight-name coco_instance_seg --dcrf +``` +Above command reads `cat.jpg` image as input, and use [Detic](https://github.com/facebookresearch/Detic/tree/main) to first segment instance of the prompted class (`--custom_vocabulary`, `"cat"` in this case). Then it uses the pre-trained features specified with `--weight-name` to cluster the features to group pixels. +- `--k` is used for the number of clusters. +- `--dcrf` is used for applying [dense-CRF](https://github.com/lucasb-eyer/pydensecrf) as post-processing. +- See [here](https://github.com/facebookresearch/PartDistillation/part_segment_demo.py#L34) to find the available `weight-name` options. 
Please download the weights from Mask2Former ([here](https://github.com/facebookresearch/Mask2Former/blob/main/MODEL_ZOO.md)) and place them under `./weights/...` (see [here](./WEIGHTS.md)). + +If setup correctly, the result should look like below: + +

+ + +

+ + + +### Evaluating Part Segments on PartImageNet +Here we evaluate the pixel-grouping as part segments on PartImageNet dataset. + +``` +python pixel_grouping_test_net.py --config-file configs/PixelGrouping.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS 4 \ +PIXEL_GROUPING.DISTANCE_METRIC "dot" \ +PIXEL_GROUPING.BACKBONE_FEATURE_KEY_LIST '["res3","res4"]' \ +PIXEL_GROUPING.FEATURE_NORMALIZE False +``` +- *Change settings to explore different configuration.* +- *If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` and use `WANDB.VIS_PERIOD_TEST` to visualize the part segments.* \ No newline at end of file diff --git a/docs/BENCHMARK.md b/docs/BENCHMARK.md new file mode 100644 index 0000000..b12ad91 --- /dev/null +++ b/docs/BENCHMARK.md @@ -0,0 +1,50 @@ +# Benchmark Training and Evaluation + +In our experiments, we compare our method to supervised baselines as well as our *one-stage* baseline. + +### Supervised baseline training +In our experiments, we train supervised baseline for (1) proposal model and (2) part segmentation model. 
+ +#### Supervised Part-proposal Model +Use the following command to train a supervised part proposal model baseline: + +``` +python supervised_train_net.py --config-file configs/SupervisedPartProposalLearning.yaml --num-gpus 8 --num-machines 1 \ +DATASETS.TRAIN '("$DATASET_NAME",)' +``` +- *Change `DATASET_NAME` for training on different datasets.* +- *Change `SUPERVISED_MODEL.USE_PER_PIXEL_LABEL` to `True` for evaluating non-overlapping proposals.* + +#### Supervised Part Segmentation Model +Use the following command to train a supervised part segmentation model baseline: + +``` +python supervised_train_net.py --config-file configs/SupervisedLearning.yaml --num-gpus 8 --num-machines 1 \ +DATASETS.TRAIN '("$DATASET_NAME",)' +``` +- *Change `DATASET_NAME` for training on different datasets.* + +### Fewshot training + +We simply initialize Mask2Former with PartDistillation and train with `$PERCENT` amount of human annotations. Similar to before, we train for *Part Proposals* and *Part Segmentation*. To train for few-shot experiments, use the following command: + +#### Part-proposal Model +``` +python supervised_train_net.py --config-file configs/SupervisedPartProposalLearning.yaml --num-gpus 8 --num-machines 1 \ +MODEL.WEIGHTS /path/to/pretrained/weights/name.pth \ +FEWSHOT_LEARNING.LABEL_PERCENTAGE $PERCENT \ +DATASETS.TRAIN '("${TRAINSET}",)' \ +DATASETS.TRAIN '("${TESTSET}",)' +``` +- *NOTE: Change `TRAINSET` and `TESTSET` for training on different datasets. Change `PERCENT` for different % of human labels.* + +#### Part Segmentation Model +``` +python supervised_train_net.py --config-file configs/SupervisedLearning.yaml --num-gpus 8 --num-machines 1 \ +MODEL.WEIGHTS /path/to/pretrained/weights/name.pth \ +FEWSHOT_LEARNING.LABEL_PERCENTAGE $PERCENT \ +DATASETS.TRAIN '("${TRAINSET}",)' \ +DATASETS.TRAIN '("${TESTSET}",)' +``` +- *NOTE: Change `TRAINSET` and `TESTSET` for training on different datasets. 
Change `PERCENT` for different % of human labels.* + diff --git a/docs/DATASETS.md b/docs/DATASETS.md new file mode 100644 index 0000000..3434e3a --- /dev/null +++ b/docs/DATASETS.md @@ -0,0 +1,102 @@ +# Prepare datasets for PartDistillation + +For training PartDistillation, we use [ImageNet-22K](https://www.image-net.org/download.php) for original setup and [ImageNet-1K](https://www.image-net.org/download.php) ([huggingface](https://huggingface.co/datasets/imagenet-1k)) for compute-friendly setup. We evaluate our models on [Pascal Parts](http://roozbehm.info/pascal-parts/pascal-parts.html), [PartImageNet](https://github.com/TACJu/PartImageNet), and [Cityscapes Part](https://github.com/mcordts/cityscapesScripts) datasets. Please download these datasets from the official websites and place or sim-link under `$PART_DISTILLATION_ROOT/datasets/`. + +``` +$PART_DISTILLATION_ROOT/datasets/ + imagenet_1k/ + imagenet_22k/ + part_imagenet/ + pascal_parts/ + cityscapes_part/ +``` + + +## ImageNet-1K +For compute-friendly setting, one can train PartDistillation with ImageNet-1K dataset. Please download dataset and place them as following + +``` +imagenet_1k/ + train/ + n01440764 + n01443537 + ... + val/ + n01440764 + n01443537 + ... +``` + + +## ImageNet-21K +Download dataset and place them as following +``` +imagenet_21k/ + synsets.dat + words.txt + ... + n02090622/ + n02090622_10.JPEG + n02090622_100.JPEG + ... + ... + +``` + + +## PartImageNet +Please download PartImageNet from the [original source](https://github.com/TACJu/PartImageNet) and place them as following + +``` +part_imagenet/ + train.json + val.json + test.json + valtest.json # copy from datasets/metadata/part_imagenet_valtest.json + train/ + val/ + n01484850 + ... + test/ + n01491361 + ... + valtest/ + n01484850 # from val/ + ... + n01491361 # from test/ + ... +``` + +*NOTE: `valtest` and `valtest.json` are not provided by the original source. 
We simply combined `val.json` and `test.json` and provided in `datasets/metadata/`. Please make a new directory `valtest` and simply copy or sim-link folders inside `valtest`, and copy `valtest.json` from `datasets/metadata/part_imagenet_valtest.json`.* + +## Pascal Parts +Pascal Parts dataset uses [the images of Pascal VOC 2012](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) and the annotations of [Pascal Parts](http://roozbehm.info/pascal-parts/pascal-parts.html). Please download them and place as following + +``` +pascal_parts/ + images/ # from Pascal VOC 2012 + ImageSets/ + JPEGImages/ + ... + annotations/ # from Pascal Parts + 2008_000002.mat + 2008_000003.mat + 2008_000007.mat + ... +``` + + +## Cityscapes Part +Cityscapes Part is a panoptic part segmentation dataset but we use instance only. Please download the dataset from [the official source](https://github.com/pmeletis/panoptic_parts) and place as following + +``` +cityscapes_part/ + leftImage8bit/ + gtFinePanopticParts/ + train/ + aachen/ + ... + val/ + frankfurt/ + ... +``` diff --git a/docs/DEMO.md b/docs/DEMO.md new file mode 100644 index 0000000..168d687 --- /dev/null +++ b/docs/DEMO.md @@ -0,0 +1,59 @@ +# PartDistillation DEMO + +Here we provide DEMO for PartDistillation. We use the default [demo interface](https://github.com/facebookresearch/detectron2/blob/main/GETTING_STARTED.md) from detectron2. Please follow [here](WEIGHTS.md) to setup [Detic](https://github.com/facebookresearch/Detic/tree/main), and also download our pre-trained weight [here](https://utexas.box.com/shared/static/ovqrzxm9jwe66l0zjqyofkowk5zvhex1.pth). Please save the weight under `./weights/PartProposalLearning/IN1K+Human/part_proposal_model.pth` (see [here](https://github.com/janghyuncho/PartDistillation/blob/9560d4fd2a79d456c88dd1239b7f9cdc7f5c58d4/part_distillation_demo.py#L114C1-L114C1)). + +With an image of a `person` and a `bicycle`: + +

+ +Use the following command to segment each class: +``` +python part_distillation_demo.py --input figs/input/bicycle_person.jpg --output figs/output/part_proposal/bicycle.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary bicycle --min-image-size 640 --non-overlapping + +python part_distillation_demo.py --input figs/input/bicycle_person.jpg --output figs/output/part_proposal/person.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary person --min-image-size 640 --non-overlapping +``` + +If setup correctly, it should look like this: +

+ + +

+ +- `--custom_vcabulary` to segment different object class. +- `--non-overlapping` flag to generate non-overlapping part segmentation. +- `--dcrf` to further smooth out the boundary. +- `--part-score-threshold` to filter-out low score parts. + +Here are different examples (`cat` and `cola`): +``` +python part_distillation_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/part_proposal/cat.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary cat --min-image-size 640 + +python part_distillation_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/part_proposal/cola.jpg --vocabulary custom --confidence-threshold 0.1 --part-score-threshold 0.3 --custom_vocabulary cola --min-image-size 640 +``` +Here we visualize overlapping part segmentation. If setup correctly, it should look like this: + +

+ + +

+ + +### Visualizing Part Segments by Pixel Grouping +We first explore different pre-trained features and their capability of part segmentation. We provide a demo to try out: + +``` +python part_segment_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/part_segment/cola.jpg --vocabulary custom --confidence-threshold 0.1 --custom_vocabulary cola --min-image-size 640 --k 4 --weight-name coco_instance_seg --dcrf + +python part_segment_demo.py --input figs/input/cat_and_cola.jpg --output figs/output/part_segment/cat.jpg --vocabulary custom --confidence-threshold 0.1 --custom_vocabulary cat --min-image-size 640 --k 4 --weight-name coco_instance_seg --dcrf +``` +Above command reads `cat.jpg` image as input, and use [Detic](https://github.com/facebookresearch/Detic/tree/main) to first segment instance of the prompted class (`--custom_vocabulary`, `"cat"` in this case). Then it uses the pre-trained features specified with `--weight-name` to cluster the features to group pixels. +- `--k` is used for the number of clusters. +- `--dcrf` is used for applying [dense-CRF](https://github.com/lucasb-eyer/pydensecrf) as post-processing. +- See [here](https://github.com/janghyuncho/PartDistillation/blob/main/part_segment_demo.py#L38) to find the available `weight-name` options. Please download the weights from Mask2Former ([here](https://github.com/facebookresearch/Mask2Former/blob/main/MODEL_ZOO.md)) and place them under `./weights/...` (see [here](./WEIGHTS.md)). + +If setup correctly, the result should look like below: + +

+ + +

diff --git a/INSTALL.md b/docs/INSTALL.md similarity index 84% rename from INSTALL.md rename to docs/INSTALL.md index 54c2d80..4982623 100644 --- a/INSTALL.md +++ b/docs/INSTALL.md @@ -22,8 +22,8 @@ cd detectron2 pip install -e . cd .. -git clone git@github.com:fairinternal/ozi_partdiscovery.git # Change it later to public repo. -cd ozi_partdiscovery +git clone git@github.com:facebookresearch/PartDistillation.git # Change it later to public repo. +cd PartDistillation pip install -r requirements.txt cd part_distillation/modeling/pixel_decoder/ops sh make.sh # CUDA_HOME must be defined and points to the directory of the installed CUDA toolkit. @@ -34,11 +34,3 @@ git clone https://github.com/facebookresearch/Detic.git --recurse-submodules cd Detic pip install -r requirements.txt ``` -*For FAIR internal: for compiling MSDeformAttn, do the following* - -``` -cd part_distillation/modeling/pixel_decoder/ops -module unload cuda -module load cuda/11.3 -sh make.sh -``` \ No newline at end of file diff --git a/docs/MODELZOO.md b/docs/MODELZOO.md new file mode 100644 index 0000000..759e69c --- /dev/null +++ b/docs/MODELZOO.md @@ -0,0 +1,71 @@ +# PartDistillation Model Zoo + +## Part Proposal Learning +Part proposal learning models are trained following here: [[1K training](TRAINING_1K.md)], [[21K training (coming soon)]()] + +To use pre-trained model for inference: +``` +python part_proposal_train_net.py --config-file configs/PartProposalLearning.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PROPOSAL_LEARNING.MIN_OBJECT_AREA_RATIO 0.0 \ +PROPOSAL_LEARNING.MIN_AREA_RATIO 0.0 \ +MODEL.WEIGHTS /path/to/model/weights/name.pth \ +OUTPUT_DIR /path/to/output/ +``` +for a part proposal model. +Change `DATASETS.TEST` and `PROPOSAL_LEARNING.POSTPROCESS_TYPES` to evaluate `"prop"` (overlapping part proposals) or `"semseg"` (non-overlapping proposals). If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` to visualize the predictions. See [here](TRAINING_1K.md) for available datasets. 
+ +To evaluate a supervised model: +``` +python supervised_train_net.py --config-file configs/SupervisedPartProposalLearning.yaml --num-gpus 8 --num-machines 1 --eval-only \ +DATASETS.TEST '("$DATASET_NAME",)' \ +SUPERVISED_MODEL.USE_PER_PIXEL_LABEL True \ +MODEL.WEIGHTS /path/to/model/weight/name.pth +``` +Change `DATASET_NAME` for different dataset evaluation and `SUPERVISED_MODEL.USE_PER_PIXEL_LABEL` to evaluate `"prop"` (overlapping part proposals) or `"semseg"` (non-overlapping proposals). If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` to visualize the predictions. See [here](TRAINING_1K.md) for available datasets. + +#### Evaluating Part-proposal Model for mIOU +Below will run clustering for each object-class first and use the cluster centroids as classifiers. +``` +python part_ranking_train_net.py --config-file configs/PartRanking.yaml --num-gpus 8 --num-machines 1 --eval-only \ +DATASETS.TEST '("${DATASET_NAME}_pre_labeling_val","${DATASET_NAME}_match_val","${DATASET_NAME}_evaluate_val",)' +``` +- *Change `DATASET_NAME` for different dataset. See [here](TRAINING_1K.md) for available datasets.* + + +### ImageNet-1K training +| config | prediction type | Pascal Part AR@200 | PartImageNet AR@200 | Download | +|-------------------|:---:|:------:|:-----------------:|:----------:| +|[PartDistillation (first-stage)](../configs/PartProposalLearning.yaml) | overlapping | 27.2 | 52.2 | [model](https://utexas.box.com/shared/static/ovqrzxm9jwe66l0zjqyofkowk5zvhex1.pth) | +|[PartDistillation (first-stage)](../configs/PartProposalLearning.yaml) | non-overlapping | 14.7 | 30.3 | [model](https://utexas.box.com/shared/static/ovqrzxm9jwe66l0zjqyofkowk5zvhex1.pth) | + + +## PartDistillation + +Final PartDistillation models are trained following here:[[1K training](TRAINING_1K.md)], [[21K training (coming soon)]()]. 
+ +To evaluate the pre-trained PartDistillation model: +``` +python part_distillation_train_net.py --config-file configs/PartDistillation.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PART_DISTILLATION.MIN_OBJECT_AREA_RATIO 0.0 \ +PART_DISTILLATION.MIN_AREA_RATIO 0.0 \ +MODEL.WEIGHTS /path/to/model/weights/name.pth \ +OUTPUT_DIR /path/to/output/ +``` +Above commands runs inference for PartDistillation. +Again, change `DATASET_NAME` for evaluating different datasets and `PROPOSAL_LEARNING.POSTPROCESS_TYPES` to evaluate `"prop"` (overlapping part proposals) or `"semseg"` (non-overlapping proposals). If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` to visualize the predictions. See [here](TRAINING_1K.md) for available datasets. + +To evaluate a supervised model: +``` +python supervised_train_net.py --config-file configs/SupervisedLearning.yaml --num-gpus 8 --num-machines 1 --eval-only \ +DATASETS.TEST '("$DATASET_NAME",)' \ +SUPERVISED_MODEL.USE_PER_PIXEL_LABEL True \ +MODEL.WEIGHTS /path/to/model/weight/name.pth +``` +Change `DATASETS.TEST` for different datasets (`pascal_part_val`, `part_imagenet_valtest`, etc.) See [here](TRAINING_1K.md) for available datasets. +If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` to visualize the predictions. + + +### ImageNet-1K training +| config | Pascal Part mIOU | PartImageNet mIOU | Download | +|-----------------------|:--------------:|:-----------:|:-----------------:| +|[PartDistillation (second-stage)](../configs/PartDistillation.yaml) | 22.3 | 46.0 | [model](https://utexas.box.com/shared/static/7651zj8n9ou3rbsmgfhjqobh7voxdnll.pth) | diff --git a/docs/TRAINING.md b/docs/TRAINING.md new file mode 100644 index 0000000..322fb15 --- /dev/null +++ b/docs/TRAINING.md @@ -0,0 +1,59 @@ + +# Training PartDistillation + +PartDistillation has multiple stages to train the full model. +Parts are separated with object segmentation and we use Detic predictions to do the job. 
+To make the process fast, we save all detic predictions of ImageNet first. + +### Save detic prediction +First, we need to download pretrained detic weight. Download it [here](https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth) and place it in `weights/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth`. +Then, use the pretrained Detic model to precompute object instance segmentation: +``` +./sh_files/detic/run.sh +``` +Above code will launch 60 parallel jobs to run detic and save the result at `pseudo_labels/object_labels/imagenet_22k_train/detic_predictions/`. + + +### Pixel grouping for class-agnostic part segments +Please donwload pretrained mask2former weight [here](https://dl.fbaipublicfiles.com/maskformer/mask2former/coco/instance/maskformer2_swin_large_IN21k_384_bs16_100ep/model_final_e5f453.pkl) and place it in `weights/mask2former/instance/swinL_i21k_q200_e100.pkl`. + +``` +./sh_files/proposal_generation/run.sh +``` + +Above code will launch 40 parallel jobs. Pixel-grouping is good initial segments, but rough. +Need to smooth out with postprocessing. We postprocess all part segments with dense-CRF with the following command. + +``` +./sh_files/dcrf/run.sh +``` +*NOTE: change the number of processes in the submit files to accomodate the resource availability.* + +Then, we start training part proposal model (1st stage), which is a class-agnostic part segmentation model based on Mask2Former. + + +### Part-proposal Learning + +``` +./sh_files/proposal_learning/train_multi.sh +``` + +Above code will train on 4 nodes with 256 batch size. Then, we need to establish global association for each object class. +This allows to produce consistent class label for each part during inference. We call this process *part ranking*. + +### Part Ranking + +``` +./sh_files/part_ranking/run.sh +``` + +This generate part segmentation labels with class (as cluster assignment). 
With this, we self-train +the entire system altogether.

Since training on the entire ImageNet-21K is too compute-intensive, we provide **compute-friendly training** that matches the zero-shot performance reported in the paper.
+ +First, we need to download pretrained detic weight (see [here](WEIGHTS.md) for the instruction). +Then, use the pretrained Detic model to precompute object instance segmentation by using `"a [class name]"` as custom text embedding. + +``` +python detic_labeling_net.py --config-file configs/DeticLabeling.yaml --num-gpus 8 --num-machines 1 --eval-only +``` +Above command launches a single job on 8 GPUs and save predictions at `./pseudo_labels` as default. Change `PROPOSAL_GENERATION.ROOT_FOLDER_NAME` to a symlink folder with enough storage (i.e., `ln -s /path/to/storage/ pseudo_labels`). + +#### Save detic predictions of human class +One important difference between ImageNet-21K and ImageNet-1K is that there is no class related to human in 1K. +However, there are plenty of human objects in the background of images and we can still obtain the segmentation by following the command below. + +``` +python detic_labeling_net.py --config-file configs/DeticLabeling.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PROPOSAL_GENERATION.DETIC_LABELING_MODE "human-only" \ +PROPOSAL_GENERATION.SAVE_SCORE_THRESHOLD 0.3 +``` +Above command uses `human, man, woman, toddler, person` as text embedding and store predictions with confidence higher than `0.3`. + +#### Multi-processing +The major bottleneck in saving predictions is writing the files (i.e., CPU). +If enough resource is available, use the following code: +``` +TOTAL_JOBS=10 +for JOB_ID in 0 1 2 3 4 5 6 7 8 9 +do + python multi_node_train_net.py --config-file configs/DeticLabeling.yaml --num-gpus 8 --num-machines 1 --eval-only --target "detic_labeling_net.py" --job-dir "output/detic/" \ + PROPOSAL_GENERATION.PARTITION_INDEX ${JOB_ID} \ + PROPOSAL_GENERATION.TOTAL_PARTTITIONS ${TOTAL_JOBS} +done +``` +Above command splits the dataset into `TOTAL_JOBS=10` and launch multi-node jobs using slurm. If slurm is not available, just launch multiple jobs manually with different `JOB_ID` (indexed from 0 to `TOTAL_JOBS`-1) each time. 
+ + +### Pixel grouping for class-agnostic part segments +From our [initial analysis](ANALYSIS.md), we show that a strong, transformer-based instance segmentation model has strong part-level signals in its feature. To better leverage this, we use a pre-trained Mask2Former model and group the pixel-level features for each object instance mask. + +First, please download a pre-trained Mask2Former weight (see [here](WEIGHTS.md)). Then, use the following commands to save part segments. + +``` +# max-gt-label +python proposal_generation_net.py --config-file configs/ProposalGeneration.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PROPOSAL_GENERATION.DETIC_LABELING_MODE "max-gt-label" \ +PROPOSAL_GENERATION.OBJECT_MASK_PATH "pseudo_labels/object_labels/detic_predictions/max-gt-label/imagenet_1k_train/" + +# human-only +python proposal_generation_net.py --config-file configs/ProposalGeneration.yaml --num-gpus 8 --num-machines 1 --eval-only \ +PROPOSAL_GENERATION.DETIC_LABELING_MODE "human-only" \ +PROPOSAL_GENERATION.OBJECT_MASK_PATH "pseudo_labels/object_labels/detic_predictions/human-only/imagenet_1k_train/" +``` +Above, each command launches a single job on 8 GPUs. If W&B is setup, set `WANDB.DISABLE_WANDB` to `False` and use `WANDB.VIS_PERIOD_TEST` to visualize the generated part segments. For example, `WANDB.VIS_PERIOD_TEST 2000` will visualize every 2000 iterations. +- *NOTE: Similar to above, use `PROPOSAL_GENERATION.PARTITION_INDEX` and `PROPOSAL_GENERATION.TOTAL_PARTTITIONS` for multi-processing.* + + +#### Post-processing part segments. +We post-process the part segments to smooth the boundary with [dense-CRF](https://github.com/lucasb-eyer/pydensecrf). 
Since this is a CPU-only job, we provide a multi-processing job as below:
+After training, move the final weight to `./weights/PartProposalLearning/IN1K+Human/part_proposal_model.pth`. + +### Part Ranking +Then, we associate predicted part proposals across dataset for each object class. +This allows to produce consistent class label for each part during inference. We call this process [Part Ranking](../part_distillation/part_ranking_model.py). +We use the [*part-level features*](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L244) to cluster the predicted parts for each object class. +To further reduce clustering noise, we apply: +- (1) score threshold [before](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L388) and [after](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L336) clustering +- (2) sample pixel-wise maximum part [before](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L370) and [after](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L316) clustering + +To run Part Ranking, use the following command: +``` +python part_ranking_train_net.py --config-file configs/PartRanking.yaml --num-gpus 8 --num-machines 1 --eval-only +``` +Again, you can multi-process this step with `PART_RANKING.TOTAL_PARTITIONS` and `PART_RANKING.PARTITION_INDEX`. If W&B is setup, use `WANDB.VIS_PERIOD_TRAIN` and `WANDB.VIS_PERIOD_TEST` after setting `WANDB.DISABLE_WANDB` to `False`. +- *NOTE: Generated labels are saved under [this directory](https://github.com/facebookresearch/PartDistillation/part_distillation/config.py#L232) (see [here](https://github.com/facebookresearch/PartDistillation/part_distillation/part_ranking_model.py#L95)).* + +### PartDistillation Training +Now, we use self-training to build class-specific part segmentation model. 
python part_distillation_train_net.py --config-file configs/PartDistillation.yaml --num-gpus 8 --num-machines 1
libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libidn2=2.3.2=h7f8727e_0 - - libopus=1.3.1=h7b6447c_0 - - libpng=1.6.37=hbc83047_0 - - libprotobuf=3.20.1=h4ff587b_0 - - libstdcxx-ng=11.2.0=h1234567_1 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.4.0=hecacb30_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libvpx=1.7.0=h439df22_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py39h7f8727e_0 - - mkl_fft=1.3.1=py39hd3c417c_0 - - mkl_random=1.2.2=py39h51133e4_0 - - ncurses=6.3=h5eee18b_3 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.23.1=py39ha15fc14_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1q=h7f8727e_0 - - pip=22.1.2=py39h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py39h06a4308_0 - - python=3.9.12=h12debd9_0 - - pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.28.1=py39h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - soupsieve=2.3.2.post1=pyhd8ed1ab_0 - - sqlite=3.39.2=h5082296_0 - - tk=8.6.12=h1ccaba5_0 - - torchaudio=0.11.0=py39_cu113 - - typing_extensions=4.3.0=py39h06a4308_0 - - tzdata=2022a=hda174b7_0 - - urllib3=1.26.11=py39h06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - x264=1!157.20191217=h7b6447c_0 - - xz=5.2.5=h7f8727e_1 - - yaml=0.2.5=h7b6447c_0 - - zlib=1.2.12=h7f8727e_2 - - zstd=1.5.2=ha4553b6_0 - - pip: - - absl-py==1.0.0 - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - albumentations==1.1.0 - - antlr4-python3-runtime==4.8 - - appdirs==1.4.4 - - argon2-cffi==21.3.0 - - argon2-cffi-bindings==21.2.0 - - asttokens==2.0.5 - - async-timeout==4.0.2 - - attrs==21.4.0 - - backcall==0.2.0 - - black==22.3.0 - - bleach==5.0.0 - - blobfile==1.2.7 - - boto3==1.24.61 - - botocore==1.27.61 - - cachetools==5.1.0 - - captum==0.5.0 - - cfgv==3.3.1 - - charset-normalizer==2.0.12 - - cityscapesscripts==2.2.1 - - click==8.1.3 - - clip==1.0 - 
- cloudpickle==2.1.0 - - coloredlogs==15.0.1 - - cycler==0.11.0 - - cython==0.29.30 - - dataclasses==0.6 - - debugpy==1.6.0 - - decorator==5.1.1 - - defusedxml==0.7.1 - - detectron2==0.6 - - distlib==0.3.4 - - docker-pycreds==0.4.0 - - einops==0.4.1 - - entrypoints==0.4 - - executing==0.8.3 - - fairscale==0.4.6 - - faiss-gpu==1.7.2 - - fastjsonschema==2.15.3 - - fasttext==0.9.2 - - filelock==3.7.0 - - fonttools==4.33.3 - - frozenlist==1.3.0 - - fsspec==2022.5.0 - - ftfy==6.1.1 - - future==0.18.2 - - fvcore==0.1.5.post20220305 - - ghstack==0.6.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - google-auth==2.6.6 - - google-auth-oauthlib==0.4.6 - - grpcio==1.46.3 - - h5py==3.7.0 - - huggingface-hub==0.9.1 - - humanfriendly==10.0 - - hydra-core==1.1.2 - - hydra-submitit-launcher==1.2.0 - - identify==2.5.1 - - imageio==2.19.2 - - importlib-metadata==4.11.4 - - install==1.3.5 - - iopath==0.1.9 - - ipykernel==6.13.0 - - ipython==8.3.0 - - ipython-genutils==0.2.0 - - jedi==0.18.1 - - jinja2==3.1.2 - - jmespath==1.0.1 - - jsonschema==4.5.1 - - jupyter-client==7.3.1 - - jupyter-core==4.10.0 - - jupyterlab-pygments==0.2.2 - - kiwisolver==1.4.2 - - libcst==0.4.3 - - lvis==0.5.3 - - markdown==3.3.7 - - markupsafe==2.1.1 - - matplotlib==3.5.2 - - matplotlib-inline==0.1.3 - - mistune==0.8.4 - - moreorless==0.4.0 - - mss==6.1.0 - - multidict==6.0.2 - - multiscaledeformableattention==1.0 - - mypy-extensions==0.4.3 - - nbclient==0.6.3 - - nbconvert==6.5.0 - - nbformat==5.4.0 - - nest-asyncio==1.5.5 - - networkx==2.8.2 - - nltk==3.7 - - nodeenv==1.6.0 - - notebook==6.4.11 - - numpy==1.22.4 - - oauthlib==3.2.0 - - omegaconf==2.1.2 - - opencv-python==4.5.5.64 - - opencv-python-headless==4.5.5.64 - - packaging==21.3 - - pandas==1.3.5 - - pandocfilters==1.5.0 - - panoptic-parts==2.0rc6 - - parso==0.8.3 - - pathspec==0.9.0 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.1.1 - - platformdirs==2.5.2 - - plotly==5.8.0 - - portalocker==2.4.0 - - pre-commit==2.19.0 - - 
prometheus-client==0.14.1 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.1 - - ptyprocess==0.7.0 - - pure-eval==0.2.2 - - pyasn1==0.4.8 - - pyasn1-modules==0.2.8 - - pybind11==2.10.0 - - pycocotools==2.0.4 - - pycryptodomex==3.14.1 - - pydensecrf==1.0rc2 - - pydeprecate==0.3.1 - - pydot==1.4.2 - - pygments==2.12.0 - - pyparsing==3.0.9 - - pyquaternion==0.9.9 - - pyrsistent==0.18.1 - - python-dateutil==2.8.2 - - pytorch-lightning==1.5.10 - - pytorch-transformers==1.2.0 - - pytz==2022.1 - - pywavelets==1.3.0 - - pyyaml==6.0 - - pyzmq==23.0.0 - - qudida==0.0.4 - - regex==2022.8.17 - - requests-oauthlib==1.3.1 - - rsa==4.8 - - ruamel-yaml==0.17.21 - - ruamel-yaml-clib==0.2.7 - - s3transfer==0.6.0 - - scikit-image==0.19.2 - - scikit-learn==1.1.1 - - scipy==1.8.1 - - send2trash==1.8.0 - - sentencepiece==0.1.97 - - sentry-sdk==1.5.12 - - setproctitle==1.2.3 - - setuptools==59.5.0 - - shapely==1.8.1.post1 - - shortuuid==1.0.9 - - smmap==5.0.0 - - stack-data==0.2.0 - - stdlibs==2022.3.16 - - submitit==1.4.2 - - tabulate==0.8.9 - - tenacity==8.0.1 - - tensorboard==2.9.0 - - tensorboard-data-server==0.6.1 - - tensorboard-plugin-wit==1.8.1 - - termcolor==1.1.0 - - terminado==0.15.0 - - threadpoolctl==3.1.0 - - tifffile==2022.5.4 - - timm==0.5.4 - - tinycss2==1.1.1 - - tokenizers==0.12.1 - - toml==0.10.2 - - tomli==2.0.1 - - tomlkit==0.11.0 - - torch==1.10.2+cu113 - - torchmetrics==0.8.2 - - torchvision==0.11.3+cu113 - - tornado==6.1 - - tqdm==4.64.0 - - trailrunner==1.2.1 - - traitlets==5.2.1.post0 - - types-termcolor==1.1.4 - - typing==3.7.4.3 - - typing-extensions==3.10.0.2 - - typing-inspect==0.7.1 - - ufmt==1.3.0 - - usort==0.6.4 - - virtualenv==20.14.1 - - wandb==0.12.19 - - wcwidth==0.2.5 - - webencodings==0.5.1 - - werkzeug==2.1.2 - - xmltodict==0.12.0 - - yacs==0.1.8 - - yarl==1.7.2 - - yaspin==2.1.0 - - zipp==3.10.0 -prefix: /private/home/janghyuncho7/anaconda3/envs/part_distillation diff --git a/figs/input/bicycle_person.jpg 
b/figs/input/bicycle_person.jpg new file mode 100644 index 0000000..8559cad Binary files /dev/null and b/figs/input/bicycle_person.jpg differ diff --git a/figs/input/car.jpg b/figs/input/car.jpg new file mode 100644 index 0000000..9f7aa6e Binary files /dev/null and b/figs/input/car.jpg differ diff --git a/figs/input/cat_and_cola.jpg b/figs/input/cat_and_cola.jpg new file mode 100644 index 0000000..c0a0c94 Binary files /dev/null and b/figs/input/cat_and_cola.jpg differ diff --git a/figs/input/chair.jpg b/figs/input/chair.jpg new file mode 100644 index 0000000..fcd0dce Binary files /dev/null and b/figs/input/chair.jpg differ diff --git a/figs/input/corgi.jpg b/figs/input/corgi.jpg new file mode 100644 index 0000000..45c1684 Binary files /dev/null and b/figs/input/corgi.jpg differ diff --git a/figs/input/horse.jpg b/figs/input/horse.jpg new file mode 100644 index 0000000..fb649df Binary files /dev/null and b/figs/input/horse.jpg differ diff --git a/figs/output/part_proposal/bicycle.jpg b/figs/output/part_proposal/bicycle.jpg new file mode 100644 index 0000000..6423f64 Binary files /dev/null and b/figs/output/part_proposal/bicycle.jpg differ diff --git a/figs/output/part_proposal/car.jpg b/figs/output/part_proposal/car.jpg new file mode 100644 index 0000000..1176575 Binary files /dev/null and b/figs/output/part_proposal/car.jpg differ diff --git a/figs/output/part_proposal/cat.jpg b/figs/output/part_proposal/cat.jpg new file mode 100644 index 0000000..8ad4312 Binary files /dev/null and b/figs/output/part_proposal/cat.jpg differ diff --git a/figs/output/part_proposal/chair.jpg b/figs/output/part_proposal/chair.jpg new file mode 100644 index 0000000..91d3f6a Binary files /dev/null and b/figs/output/part_proposal/chair.jpg differ diff --git a/figs/output/part_proposal/cola.jpg b/figs/output/part_proposal/cola.jpg new file mode 100644 index 0000000..0c1fb39 Binary files /dev/null and b/figs/output/part_proposal/cola.jpg differ diff --git 
a/figs/output/part_proposal/corgi.jpg b/figs/output/part_proposal/corgi.jpg new file mode 100644 index 0000000..3802c00 Binary files /dev/null and b/figs/output/part_proposal/corgi.jpg differ diff --git a/figs/output/part_proposal/horse.jpg b/figs/output/part_proposal/horse.jpg new file mode 100644 index 0000000..f9584dd Binary files /dev/null and b/figs/output/part_proposal/horse.jpg differ diff --git a/figs/output/part_proposal/person.jpg b/figs/output/part_proposal/person.jpg new file mode 100644 index 0000000..2013f95 Binary files /dev/null and b/figs/output/part_proposal/person.jpg differ diff --git a/figs/output/part_segment/cat.jpg b/figs/output/part_segment/cat.jpg new file mode 100644 index 0000000..21f8bd8 Binary files /dev/null and b/figs/output/part_segment/cat.jpg differ diff --git a/figs/output/part_segment/cola.jpg b/figs/output/part_segment/cola.jpg new file mode 100644 index 0000000..1286e49 Binary files /dev/null and b/figs/output/part_segment/cola.jpg differ diff --git a/make_visualization.py b/make_visualization.py index 6f383ee..2f430e6 100644 --- a/make_visualization.py +++ b/make_visualization.py @@ -3,20 +3,21 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os + +import os import cv2 -import copy -import argparse -import numpy as np -import torch -import matplotlib.pyplot as plt +import copy +import argparse +import numpy as np +import torch +import matplotlib.pyplot as plt from detectron2.data.detection_utils import read_image from pycocotools import mask as coco_mask from detectron2.utils.visualizer import ColorMode, Visualizer, GenericMask from detectron2.structures import Instances from detectron2.data import transforms as T -from PIL import Image, ImageDraw, ImageFont +from PIL import Image, ImageDraw, ImageFont IMAGE_SIZE = 640 @@ -47,13 +48,13 @@ def draw_instance_predictions(self, predictions): def ann_to_instance_dict(data): masks = torch.tensor([coco_mask.decode(ann["segmentation"]) for ann in data["part_masks"]]) label = data["part_labels"] - + instance_dict = {} for msk, lbl in zip(masks, label): instance = Instances(masks.shape[1:]) instance.pred_masks = msk[None] instance.pred_classes = lbl[None] - + instance_dict[lbl.item()] = instance return instance_dict @@ -84,14 +85,14 @@ def get_vis_image(data, instance, opacity=0.9): image = read_image(data["file_name"]) image = T.apply_transform_gens(augs, image)[0] white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) visualizer = Partvisualizer(image) vis_image = visualizer.draw_instance_predictions(predictions=instance).get_image() vis_image = Image.fromarray(vis_image) - + return vis_image - + augs = [T.ResizeScale(min_scale=1.0, max_scale=1.0, target_height=IMAGE_SIZE, target_width=IMAGE_SIZE), T.FixedSizeCrop(crop_size=(IMAGE_SIZE, IMAGE_SIZE)) ] @@ -138,8 +139,8 @@ def get_argparse(): source_root = f"{args.pseudo_root_folder}/part_labels/part_masks_with_class/{args.dataset_name}/{args.mask_ranking_type}/{args.object_mask_type}/{args.model_name}/local_l2_4/masking_step_24/global_l2_{args.num_parts}/" target_root = 
f"visualization/{args.dataset_name}/{args.mask_ranking_type}/{args.object_mask_type}/{args.model_name}/local_l2_4/masking_step_24/global_l2_{args.num_parts}/" collage_root = f"collages/{args.dataset_name}/{args.mask_ranking_type}/{args.object_mask_type}/{args.model_name}/local_l2_4/masking_step_24/global_l2_{args.num_parts}/" - - # For model predictions. + + # For model predictions. if args.mode == "model_predictions": source_root = f"visualization/{args.dataset_name}/{args.model_name}/" target_root = f"visualization/{args.dataset_name}/overlayed_images/{args.model_name}/" @@ -158,8 +159,8 @@ def get_argparse(): if args.num_parallel_jobs > 0: num_total_classes = len(code_list) num_classes_per_job = num_total_classes // args.num_parallel_jobs - num_remaining_classes = num_total_classes - args.num_parallel_jobs * num_classes_per_job - num_current_job_classes = num_classes_per_job + num_remaining_classes = num_total_classes - args.num_parallel_jobs * num_classes_per_job + num_current_job_classes = num_classes_per_job start_i = num_current_job_classes * (args.parallel_job_id-1) end_i = num_current_job_classes * args.parallel_job_id @@ -176,12 +177,12 @@ def get_argparse(): folder_name = code + "_" + fname_to_classname[code] pname_list = os.listdir(os.path.join(target_root, folder_name)) for pname in pname_list: - pathlist = [] + pathlist = [] count = 0 collage_id = 0 for fname in os.listdir(os.path.join(target_root, folder_name, pname)): pathlist.append(os.path.join(target_root, folder_name, pname, fname)) - count += 1 + count += 1 if count % args.collage_size**2 == 0: collage = make_collage(args.collage_size, pathlist) @@ -190,18 +191,18 @@ def get_argparse(): collage.save(os.path.join(collage_root, "collage_{}x{}".format(args.collage_size, args.collage_size), folder_name, pname, fname)) pathlist = [] collage_id += 1 - + if args.collage_limit > 0 and args.collage_limit < collage_id: - break + break if progress_count % 5 == 0: - print('{:.2f} \% 
done.'.format(progress_count/len(code_list) * 100), flush=True) + print('{:.2f} \% done.'.format(progress_count/len(code_list) * 100), flush=True) else: - pathlist = [] + pathlist = [] count = 0 collage_id = 0 for fname in os.listdir("debug_vis"): pathlist.append(os.path.join("debug_vis", fname)) - count += 1 + count += 1 if count % args.collage_size**2 == 0: collage = make_collage(args.collage_size, pathlist) @@ -229,7 +230,7 @@ def get_argparse(): for part_id, instance in instance_dict.items(): if not args.debug and not os.path.exists(os.path.join(target_root, folder_name, "part_{}".format(part_id))): os.makedirs(os.path.join(target_root, folder_name, "part_{}".format(part_id))) - + if not os.path.exists(os.path.join(target_root, folder_name, "part_{}".format(part_id), fname)): vis_image = get_vis_image(data, instance, 0.7) debug_count += 1 @@ -243,5 +244,7 @@ def get_argparse(): # print("Saved.", os.path.join(target_root, folder_name, "part_{}".format(part_id), fname)) if count % 10 == 0: - print('{:.2f} \% done.'.format(count/len(code_list) * 100), flush=True) - print("Done. ") + print('{:.2f} \% done.'.format(count/len(code_list) * 100), flush=True) + print("Done. ") + + diff --git a/multi_node_train_net.py b/multi_node_train_net.py index 90a8500..f3cd2bb 100644 --- a/multi_node_train_net.py +++ b/multi_node_train_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + #!/usr/bin/env python import argparse import os @@ -231,7 +232,7 @@ def main(): timeout_min = args.timeout kwargs = {} if args.use_volta32: - # "constraint" is deprecated. + # "constraint" is deprecated. 
kwargs["slurm_constraint"] = "volta32gb" if args.comment: kwargs["comment"] = args.comment @@ -260,4 +261,4 @@ def main(): print(f"[launcher] Submitted job_id: {job.job_id}, dir: {job.paths.folder}") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/part_distillation/__init__.py b/part_distillation/__init__.py index 9d43933..fd0ebed 100644 --- a/part_distillation/__init__.py +++ b/part_distillation/__init__.py @@ -3,10 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import data + +from . import data from . import modeling -from .config import (add_maskformer2_config, add_proposal_learning_config, add_wandb_config, - add_custom_datasets_config, add_proposal_generation_config, add_part_ranking_config, +from .config import (add_maskformer2_config, add_proposal_learning_config, add_wandb_config, + add_custom_datasets_config, add_proposal_generation_config, add_part_ranking_config, add_part_distillation_config, add_pixel_grouping_confing, add_supervised_model_config, add_fewshot_learning_config) from .proposal_model import ProposalModel diff --git a/part_distillation/config.py b/part_distillation/config.py index e7de3a5..a8a1d39 100644 --- a/part_distillation/config.py +++ b/part_distillation/config.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + # -*- coding: utf-8 -*- from detectron2.config import CfgNode as CN @@ -24,7 +25,7 @@ def add_maskformer2_config(cfg): cfg.INPUT.SIZE_DIVISIBILITY = -1 cfg.INPUT.IMAGE_SIZE_BASE = 640 - # solver config + # solver config # weight decay on embedding cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 # optimizer @@ -110,7 +111,7 @@ def add_maskformer2_config(cfg): # point loss configs # Number of points sampled during training for a mask point head. 
- cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 + cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112 cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS_MATCH = 112 * 112 cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS_LOSS = 112 * 112 # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the @@ -122,13 +123,17 @@ def add_maskformer2_config(cfg): # NOTE: Added config for PartDistillation. cfg.MODEL.MASK_FORMER.FREEZE_KEYS = [] - cfg.MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE = False + cfg.MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE = False + + # fp16 + cfg.FP16 = False + cfg.USE_CHECKPOINT = False def add_wandb_config(cfg): - cfg.WANDB = CN() - cfg.WANDB.DISABLE_WANDB = False - cfg.WANDB.GROUP = None + cfg.WANDB = CN() + cfg.WANDB.DISABLE_WANDB = False + cfg.WANDB.GROUP = None cfg.WANDB.PROJECT = "" cfg.WANDB.VIS_PERIOD_TRAIN = 200 cfg.WANDB.VIS_PERIOD_TEST = 20 @@ -140,20 +145,20 @@ def add_wandb_config(cfg): def add_proposal_learning_config(cfg): - cfg.PROPOSAL_LEARNING = CN() - cfg.PROPOSAL_LEARNING.MIN_OBJECT_AREA_RATIO = 0.001 + cfg.PROPOSAL_LEARNING = CN() + cfg.PROPOSAL_LEARNING.MIN_OBJECT_AREA_RATIO = 0.001 cfg.PROPOSAL_LEARNING.MIN_AREA_RATIO = 0.0 cfg.PROPOSAL_LEARNING.MIN_SCORE = -1.0 cfg.PROPOSAL_LEARNING.DATASET_PATH_LIST = [] cfg.PROPOSAL_LEARNING.FILTERED_CODE_PATH_LIST = [] cfg.PROPOSAL_LEARNING.EXCLUDE_CODE_PATH = "" - cfg.PROPOSAL_LEARNING.PATH_ONLY = False + cfg.PROPOSAL_LEARNING.PATH_ONLY = False cfg.PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL = True cfg.PROPOSAL_LEARNING.DATASET_PATH = "" - cfg.PROPOSAL_LEARNING.LABEL_PERCENTAGE = 100 - cfg.PROPOSAL_LEARNING.APPLY_MASKING_WITH_OBJECT_MASK = True + cfg.PROPOSAL_LEARNING.LABEL_PERCENTAGE = 100 + cfg.PROPOSAL_LEARNING.APPLY_MASKING_WITH_OBJECT_MASK = True cfg.PROPOSAL_LEARNING.POSTPROCESS_TYPES = [] - cfg.PROPOSAL_LEARNING.DEBUG = False + cfg.PROPOSAL_LEARNING.DEBUG = False @@ -162,99 +167,105 @@ def add_custom_datasets_config(cfg): cfg.CUSTOM_DATASETS = CN() cfg.CUSTOM_DATASETS.BASE_SIZE 
= -1 cfg.CUSTOM_DATASETS.AUG_NAME_LIST = [] - cfg.CUSTOM_DATASETS.USE_MERGED_GT = True + cfg.CUSTOM_DATASETS.USE_MERGED_GT = True cfg.CUSTOM_DATASETS.LABEL_PERCENTAGE = 100 cfg.CUSTOM_DATASETS.PASCAL_PARTS = CN() cfg.CUSTOM_DATASETS.PASCAL_PARTS.IMAGES_DIRNAME = "" cfg.CUSTOM_DATASETS.PASCAL_PARTS.ANNOTATIONS_DIRNAME = "" cfg.CUSTOM_DATASETS.PASCAL_PARTS.SUBSET_CLASS_NAMES = [] - cfg.CUSTOM_DATASETS.PASCAL_PARTS.DEBUG = False + cfg.CUSTOM_DATASETS.PASCAL_PARTS.DEBUG = False cfg.CUSTOM_DATASETS.CITYSCAPES_PART = CN() - cfg.CUSTOM_DATASETS.CITYSCAPES_PART.IMAGES_DIRNAME = "" - cfg.CUSTOM_DATASETS.CITYSCAPES_PART.ANNOTATIONS_DIRNAME = "" - cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY = False - cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG = False + cfg.CUSTOM_DATASETS.CITYSCAPES_PART.IMAGES_DIRNAME = "" + cfg.CUSTOM_DATASETS.CITYSCAPES_PART.ANNOTATIONS_DIRNAME = "" + cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY = False + cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG = False cfg.CUSTOM_DATASETS.PART_IMAGENET = CN() - cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME = "" - cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME = "" - cfg.CUSTOM_DATASETS.PART_IMAGENET.DEBUG = False - + cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME = "" + cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME = "" + cfg.CUSTOM_DATASETS.PART_IMAGENET.DEBUG = False + def add_proposal_generation_config(cfg): - cfg.PROPOSAL_GENERATION = CN() + cfg.PROPOSAL_GENERATION = CN() cfg.PROPOSAL_GENERATION.DATASET_NAME = "imagenet_22k_train" cfg.PROPOSAL_GENERATION.OBJECT_MASK_TYPE = "detic" cfg.PROPOSAL_GENERATION.OBJECT_MASK_PATH = "pseudo_labels/object_labels/imagenet_22k_train/detic_predictions/" - cfg.PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS = 4 + cfg.PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS = 4 cfg.PROPOSAL_GENERATION.DISTANCE_METRIC = "l2" cfg.PROPOSAL_GENERATION.FEATURE_NORMALIZE = False cfg.PROPOSAL_GENERATION.BACKBONE_FEATURE_KEY_LIST = ["res4"] cfg.PROPOSAL_GENERATION.TOTAL_PARTITIONS = -1 
- cfg.PROPOSAL_GENERATION.PARTITION_INDEX = -1 + cfg.PROPOSAL_GENERATION.PARTITION_INDEX = -1 cfg.PROPOSAL_GENERATION.BATCH_SIZE = 4 - cfg.PROPOSAL_GENERATION.WITH_GIVEN_MASK = False - cfg.PROPOSAL_GENERATION.USE_PART_IMAGENET_CLASSES = False + cfg.PROPOSAL_GENERATION.WITH_GIVEN_MASK = False + cfg.PROPOSAL_GENERATION.USE_PART_IMAGENET_CLASSES = False cfg.PROPOSAL_GENERATION.FILTERED_CODE_PATH_LIST = [] cfg.PROPOSAL_GENERATION.EXCLUDE_CODE_PATH = "" cfg.PROPOSAL_GENERATION.SINGLE_CLASS_CODE = "" - cfg.PROPOSAL_GENERATION.DEBUG = False - - + cfg.PROPOSAL_GENERATION.ROOT_FOLDER_NAME = "pseudo_labels" + cfg.PROPOSAL_GENERATION.DETIC_LABELING_MODE = "max-gt-label" # "max-gt-label" or "human-only" + cfg.PROPOSAL_GENERATION.SAVE_SCORE_THRESHOLD = 0.0 + cfg.PROPOSAL_GENERATION.DEBUG = False + + def add_part_ranking_config(cfg): - cfg.PART_RANKING = CN() + cfg.PART_RANKING = CN() cfg.PART_RANKING.DATASET_PATH = "" cfg.PART_RANKING.DATASET_PATH_LIST = [] cfg.PART_RANKING.FILTERED_CODE_PATH_LIST = [] cfg.PART_RANKING.EXCLUDE_CODE_PATH = "" - cfg.PART_RANKING.PATH_ONLY = False + cfg.PART_RANKING.PATH_ONLY = False cfg.PART_RANKING.NUM_CLUSTERS = 8 cfg.PART_RANKING.CLASSIFIER_METRIC = "l2" cfg.PART_RANKING.PROPOSAL_KEY = "decoder_output" - cfg.PART_RANKING.PROPOSAL_FEATURE_NORM = True - cfg.PART_RANKING.MIN_OBJECT_AREA_RATIO = 0.001 + cfg.PART_RANKING.PROPOSAL_FEATURE_NORM = True + cfg.PART_RANKING.MIN_OBJECT_AREA_RATIO = 0.001 cfg.PART_RANKING.MIN_AREA_RATIO_1 = 0.0 cfg.PART_RANKING.MIN_AREA_RATIO_2 = 0.0 cfg.PART_RANKING.MIN_SCORE_1 = 0.0 cfg.PART_RANKING.MIN_SCORE_2 = 0.0 - cfg.PART_RANKING.USE_PER_PIXEL_LABEL_DURING_CLUSTERING = True + cfg.PART_RANKING.USE_PER_PIXEL_LABEL_DURING_CLUSTERING = True cfg.PART_RANKING.USE_PER_PIXEL_LABEL_DURING_LABELING = True - cfg.PART_RANKING.APPLY_MASKING_WITH_OBJECT_MASK = True + cfg.PART_RANKING.APPLY_MASKING_WITH_OBJECT_MASK = True cfg.PART_RANKING.TOTAL_PARTITIONS = -1 - cfg.PART_RANKING.PARTITION_INDEX = -1 - cfg.PART_RANKING.DEBUG = 
False + cfg.PART_RANKING.PARTITION_INDEX = -1 + cfg.PART_RANKING.ROOT_FOLDER_NAME = "pseudo_labels" + cfg.PART_RANKING.WEIGHT_NAME = "default" + cfg.PART_RANKING.SAVE_ANNOTATIONS = False + cfg.PART_RANKING.DEBUG = False def add_part_distillation_config(cfg): - cfg.PART_DISTILLATION = CN() + cfg.PART_DISTILLATION = CN() cfg.PART_DISTILLATION.DATASET_PATH = "" cfg.PART_DISTILLATION.DATASET_PATH_LIST = [] cfg.PART_DISTILLATION.FILTERED_CODE_PATH_LIST = [] cfg.PART_DISTILLATION.EXCLUDE_CODE_PATH = "" - cfg.PART_DISTILLATION.PATH_ONLY = False + cfg.PART_DISTILLATION.PATH_ONLY = False cfg.PART_DISTILLATION.USE_PER_PIXEL_LABEL = True cfg.PART_DISTILLATION.NUM_PART_CLASSES = 8 cfg.PART_DISTILLATION.NUM_OBJECT_CLASSES = 1000 # ImageNet-1K - cfg.PART_DISTILLATION.MIN_OBJECT_AREA_RATIO = 0.001 + cfg.PART_DISTILLATION.MIN_OBJECT_AREA_RATIO = 0.001 cfg.PART_DISTILLATION.MIN_AREA_RATIO = -1.0 cfg.PART_DISTILLATION.MIN_SCORE = -1.0 - cfg.PART_DISTILLATION.USE_ORACLE_CLASSIFIER = False - cfg.PART_DISTILLATION.APPLY_MASKING_WITH_OBJECT_MASK = True + cfg.PART_DISTILLATION.USE_ORACLE_CLASSIFIER = False + cfg.PART_DISTILLATION.APPLY_MASKING_WITH_OBJECT_MASK = True cfg.PART_DISTILLATION.TOTAL_PARTITIONS = -1 - cfg.PART_DISTILLATION.PARTITION_INDEX = -1 - cfg.PART_DISTILLATION.SET_IMAGE_SQUARE = False - cfg.PART_DISTILLATION.DEBUG = False + cfg.PART_DISTILLATION.PARTITION_INDEX = -1 + cfg.PART_DISTILLATION.SET_IMAGE_SQUARE = False + cfg.PART_DISTILLATION.DEBUG = False def add_pixel_grouping_confing(cfg): cfg.PIXEL_GROUPING = CN() - cfg.PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS = 4 + cfg.PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS = 4 cfg.PIXEL_GROUPING.DISTANCE_METRIC = "l2" cfg.PIXEL_GROUPING.BACKBONE_FEATURE_KEY_LIST = ["res4"] cfg.PIXEL_GROUPING.FEATURE_NORMALIZE = False @@ -264,13 +275,13 @@ def add_pixel_grouping_confing(cfg): def add_supervised_model_config(cfg): - cfg.SUPERVISED_MODEL = CN() - cfg.SUPERVISED_MODEL.USE_PER_PIXEL_LABEL = False - 
cfg.SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK = True - cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING = False - cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE = False + cfg.SUPERVISED_MODEL = CN() + cfg.SUPERVISED_MODEL.USE_PER_PIXEL_LABEL = False + cfg.SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK = True + cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING = False + cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE = False def add_fewshot_learning_config(cfg): - cfg.FEWSHOT_LEARNING = CN() - cfg.FEWSHOT_LEARNING.LABEL_PERCENTAGE = 100 + cfg.FEWSHOT_LEARNING = CN() + cfg.FEWSHOT_LEARNING.LABEL_PERCENTAGE = 100 \ No newline at end of file diff --git a/part_distillation/data/__init__.py b/part_distillation/data/__init__.py index f7ae7b7..5ebe885 100644 --- a/part_distillation/data/__init__.py +++ b/part_distillation/data/__init__.py @@ -1,6 +1 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import datasets +from . import datasets diff --git a/part_distillation/data/dataset_mappers/cityscapes_part_mapper.py b/part_distillation/data/dataset_mappers/cityscapes_part_mapper.py index c9b25a4..c7153bc 100644 --- a/part_distillation/data/dataset_mappers/cityscapes_part_mapper.py +++ b/part_distillation/data/dataset_mappers/cityscapes_part_mapper.py @@ -3,14 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import copy import logging -import os +import os import numpy as np import torch import pycocotools.mask as mask_util -from PIL import Image +from PIL import Image from typing import Any, Dict, List, Set, Tuple from detectron2.config import configurable from detectron2.data import detection_utils as utils @@ -39,11 +40,11 @@ def load_object_and_parts(dict, file_path): """ - Object classes: 24, 25, 26, 27, 28 (5 classes). 
- - Object class starts from 24 and ends with 28. - Part classes: 15 + 8 = 23. - - Part label starts from 1, and ends with either 4 or 5. - - -1 is ignore, and 0 is unlabeled/void. + Object classes: 24, 25, 26, 27, 28 (5 classes). + - Object class starts from 24 and ends with 28. + Part classes: 15 + 8 = 23. + - Part label starts from 1, and ends with either 4 or 5. + - -1 is ignore, and 0 is unlabeled/void. """ instances = utils.annotations_to_instances(dict["annotations"], (dict["height"], dict["width"]), mask_format="bitmask") if hasattr(instances, "gt_masks"): @@ -53,9 +54,9 @@ def load_object_and_parts(dict, file_path): img = np.array(Image.open(file_path)) sids, iids, pids = pp.decode_uids(img) - + object_instances = [] - part_instances = [] + part_instances = [] for instance_id, object_category_id in enumerate(obj_classes): object_category_id = object_category_id.item() object_dict = {"object_category": OBJECT_CLASSES[object_category_id], @@ -70,10 +71,10 @@ def load_object_and_parts(dict, file_path): part_instances_per_object = [] for _pid in np.unique(part_map): # ignore -1 and 0. - if _pid > 0: + if _pid > 0: part_id = PART_BASE_ID[object_category_id] + _pid-1 part_dict = {"part_category": PART_CLASSES[part_id], - "part_category_id": part_id, # shifting to make it 0 start. + "part_category_id": part_id, # shifting to make it 0 start. "category_id": part_id, # For histogram printing. 
"object_index": instance_id, "segmentation": mask_util.encode(np.asfortranarray(np.where(part_map==_pid, True, False))), @@ -81,7 +82,7 @@ def load_object_and_parts(dict, file_path): part_instances_per_object.append(part_dict) object_instances.append(object_dict) part_instances.append(part_instances_per_object) - + return object_instances, part_instances else: return None, None @@ -100,7 +101,7 @@ def __init__( aug_without_crop, image_format, size_divisibility, - instance_mask_format: str = "bitmask", + instance_mask_format: str = "bitmask", use_merged_gt: bool=False, ): """ @@ -118,7 +119,7 @@ def __init__( self.instance_mask_format = instance_mask_format self.num_repeats = 20 # number of repeats until give up. self.use_merged_gt = use_merged_gt - + @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation @@ -177,17 +178,17 @@ def _forward_with_aug(self, _dataset_dict, aug): try: self._transform_part_annotations(dataset_dict, transforms, image_shape) except: - return None + return None # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. 
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) - + del dataset_dict["annotations"] del dataset_dict["part_annotations"] return dataset_dict - + def __call__(self, _dataset_dict): @@ -203,12 +204,12 @@ def __call__(self, _dataset_dict): object_instances, part_instances = load_object_and_parts(dict, part_file) if object_instances is not None: - dict["annotations"] = object_instances + dict["annotations"] = object_instances dict["part_annotations"] = part_instances - _dataset_dict = dict + _dataset_dict = dict else: - return None + return None if self.is_train: for _ in range(self.num_repeats): @@ -216,24 +217,24 @@ def __call__(self, _dataset_dict): if dataset_dict is not None \ and "part_instances" in dataset_dict \ and dataset_dict["part_instances"].has("gt_masks") \ - and len(dataset_dict["part_instances"]) > 0: - return dataset_dict + and len(dataset_dict["part_instances"]) > 0: + return dataset_dict return self._forward_with_aug(_dataset_dict, self.aug_without_crop) else: return self._forward_with_aug(_dataset_dict, self.aug) - def _transform_annotations(self, - dataset_dict: Dict[str, Any], - transforms: Any, + def _transform_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): annos = [ utils.transform_instance_annotations(obj, transforms, image_shape, keypoint_hflip_indices=False) for obj in dataset_dict["annotations"] if obj.get("iscrowd", 0) == 0 ] - instances = utils.annotations_to_instances(annos, image_shape, + instances = utils.annotations_to_instances(annos, image_shape, mask_format=self.instance_mask_format) obj_mapping = [obj_id for obj_id, obj in enumerate(dataset_dict["annotations"])] instances.obj_mapping = torch.tensor(obj_mapping, dtype=torch.int64) @@ -241,9 +242,9 @@ def _transform_annotations(self, dataset_dict["instances"] = utils.filter_empty_instances(instances, by_box=False) - def _transform_part_annotations(self, - dataset_dict: Dict[str, Any], - transforms: 
Any, + def _transform_part_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): parts_list = [ part_ann @@ -258,8 +259,8 @@ def _transform_part_annotations(self, for part_per_obj in parts_list: for part in part_per_obj: part["bbox"] = [0, 0, image_shape[0], image_shape[1]] - part["bbox_mode"] = BoxMode.XYXY_ABS - + part["bbox_mode"] = BoxMode.XYXY_ABS + # The list of lists of parts will be flattened below, get mapping between a # part in the flat list and the object it corresponds to. obj_mapping = [obj_id for obj_id, obj in enumerate(parts_list) for _ in obj] @@ -287,7 +288,7 @@ def _transform_part_annotations(self, [i for i, _ in enumerate(flat_part_segs)], dtype=torch.int64 ) instances = utils.filter_empty_instances(instances, by_box=False) - + # save original part masks for evaluation dataset_dict["orig_part_maps"] = [ parts diff --git a/part_distillation/data/dataset_mappers/imagenet_part_ranking_dataset_mapper.py b/part_distillation/data/dataset_mappers/imagenet_part_ranking_dataset_mapper.py index fc9565e..ec6bad4 100644 --- a/part_distillation/data/dataset_mappers/imagenet_part_ranking_dataset_mapper.py +++ b/part_distillation/data/dataset_mappers/imagenet_part_ranking_dataset_mapper.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import copy import logging -import os +import os import numpy as np import torch from detectron2.config import configurable @@ -24,24 +25,24 @@ def __init__( base_aug, aug, class_code_to_class_index, - instance_mask_format: str = "bitmask", + instance_mask_format: str = "bitmask", ): - self.base_aug = base_aug - self.aug = aug + self.base_aug = base_aug + self.aug = aug self.img_format = image_format self.class_code_to_class_index = class_code_to_class_index self.instance_mask_format = instance_mask_format - + @classmethod def from_config(cls, cfg, class_code_to_class_index): image_size = cfg.INPUT.IMAGE_SIZE - - # Need to resize to match GT. + + # Need to resize to match GT. base_aug = [T.ResizeScale( min_scale=1.0, max_scale=1.0, target_height=image_size, target_width=image_size), ] - aug = [T.FixedSizeCrop(crop_size=(image_size, image_size))] - + aug = [T.FixedSizeCrop(crop_size=(image_size, image_size))] + ret = { "base_aug": base_aug, "aug": aug, @@ -55,8 +56,8 @@ def from_config(cls, cfg, class_code_to_class_index): def __call__(self, dataset_dict): - image_original = utils.read_image(dataset_dict["file_name"], format=self.img_format) - image, _ = T.apply_transform_gens(self.base_aug, image_original) + image_original = utils.read_image(dataset_dict["file_name"], format=self.img_format) + image, _ = T.apply_transform_gens(self.base_aug, image_original) image_shape1 = image.shape image, transforms = T.apply_transform_gens(self.aug, image) @@ -81,15 +82,15 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): parts_list = dataset_dict["pseudo_annotations"] class_code = dataset_dict["class_code"] - # NOTE: We do not use these information for pseudo label, but - # to make the below functions happy we need them. + # NOTE: We do not use these information for pseudo label, but + # to make the below functions happy we need them. # NOTE: set "by_box=False" for filtering empty instances !!! 
for part in parts_list: part["bbox"] = [0, 0, image_shape[0], image_shape[1]] part["bbox_mode"] = BoxMode.XYXY_ABS if "category_id" not in part: part["category_id"] = -1 - + # Get flat list of annotations. annos = [utils.transform_instance_annotations( part, @@ -97,11 +98,17 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): image_shape, ) for part in parts_list] + # NOTE: detectron2 pads 255 instead of 0 so make sure padding is correct. + if annos[0]['segmentation'].dtype == np.uint8: + masks = torch.tensor([_['segmentation'] for _ in annos]) + for ann in annos: + ann['segmentation'][ann['segmentation']==255] = 0 + # Convert to instances. instances = utils.annotations_to_instances( annos, image_shape, mask_format=self.instance_mask_format ) - + if hasattr(instances, 'gt_masks'): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = utils.filter_empty_instances(instances, by_box=False) @@ -111,3 +118,5 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): new_instances.set("gt_masks", BitMasks(instances.gt_masks.tensor.sum(0)[None])) new_instances.set("gt_classes", torch.tensor([self.class_code_to_class_index[class_code]])) dataset_dict["instances"] = new_instances + + diff --git a/part_distillation/data/dataset_mappers/part_distillation_dataset_mapper.py b/part_distillation/data/dataset_mappers/part_distillation_dataset_mapper.py index d7d7b65..779da9f 100644 --- a/part_distillation/data/dataset_mappers/part_distillation_dataset_mapper.py +++ b/part_distillation/data/dataset_mappers/part_distillation_dataset_mapper.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import copy import logging -import os +import os import numpy as np import torch from typing import Tuple, Union, Any, List, Set @@ -28,25 +29,25 @@ def __init__( base_aug, augs, weak_augs, - instance_mask_format: str="bitmask", + instance_mask_format: str="bitmask", min_object_area_ratio: float=-1.0, min_area_ratio: float=-1.0, - min_score: float=-1.0, + min_score: float=-1.0, class_code_to_class_id: dict={}, ): - self.base_aug = base_aug + self.base_aug = base_aug self.augs = augs self.weak_augs = weak_augs self.img_format = image_format self.instance_mask_format = instance_mask_format self.num_repeats = 100 # number of repeats until give up. - self.logger = logging.getLogger("part_distillation") + self.logger = logging.getLogger("part_distillation") self.min_object_area_ratio = min_object_area_ratio self.min_area_ratio = min_area_ratio - self.min_score = min_score + self.min_score = min_score self.class_code_to_class_id = class_code_to_class_id - + @classmethod def from_config(cls, cfg, is_train=True): @@ -54,16 +55,16 @@ def from_config(cls, cfg, is_train=True): image_size = cfg.INPUT.IMAGE_SIZE aug_name_list = cfg.CUSTOM_DATASETS.AUG_NAME_LIST set_image_square = cfg.PART_DISTILLATION.SET_IMAGE_SQUARE - - # Need to resize to match GT. + + # Need to resize to match GT. base_aug = [T.ResizeScale( min_scale=1.0, max_scale=1.0, target_height=base_size, target_width=base_size ),] - + if set_image_square: - # Fixing label bug from earlier ... - # some annotations are already in square format. - # TODO: remove when the bug is fixed. + # Fixing label bug from earlier ... + # some annotations are already in square format. + # TODO: remove when the bug is fixed. 
base_aug.append(T.FixedSizeCrop(crop_size=(base_size, base_size))) augs, weak_augs = [], [] @@ -74,13 +75,13 @@ def from_config(cls, cfg, is_train=True): augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) weak_augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) if "rotation_45" in aug_name_list: - augs.append(T.RandomRotation((0, 45))) + augs.append(T.RandomRotation((0, 45))) if "rotation_90" in aug_name_list: - augs.append(T.RandomRotation((0, 90))) + augs.append(T.RandomRotation((0, 90))) if "rotation_180" in aug_name_list: - augs.append(T.RandomRotation((0, 180))) + augs.append(T.RandomRotation((0, 180))) if "rotation" in aug_name_list: - augs.append(T.RandomRotation((0, 360))) + augs.append(T.RandomRotation((0, 360))) if "crop" in aug_name_list: augs.append(T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, @@ -88,27 +89,27 @@ def from_config(cls, cfg, is_train=True): if "scale" in aug_name_list: min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE - augs.extend([T.ResizeScale(min_scale=min_scale, - max_scale=max_scale, - target_height=image_size, + augs.extend([T.ResizeScale(min_scale=min_scale, + max_scale=max_scale, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) else: - # No resizing but pad to make it square shape. - augs.extend([T.ResizeScale(min_scale=1.0, - max_scale=1.0, - target_height=image_size, + # No resizing but pad to make it square shape. 
+ augs.extend([T.ResizeScale(min_scale=1.0, + max_scale=1.0, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) - weak_augs.extend([T.ResizeScale(min_scale=1.0, - max_scale=1.0, - target_height=image_size, + weak_augs.extend([T.ResizeScale(min_scale=1.0, + max_scale=1.0, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) - test_aug = [] + test_aug = [] class_code_to_class_id = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).class_code_to_class_id ret = { @@ -118,47 +119,48 @@ def from_config(cls, cfg, is_train=True): "image_format": cfg.INPUT.FORMAT, "instance_mask_format": cfg.INPUT.MASK_FORMAT, "class_code_to_class_id": class_code_to_class_id, - "min_object_area_ratio": cfg.PART_DISTILLATION.MIN_OBJECT_AREA_RATIO, - "min_area_ratio": cfg.PART_DISTILLATION.MIN_AREA_RATIO, + "min_object_area_ratio": cfg.PART_DISTILLATION.MIN_OBJECT_AREA_RATIO, + "min_area_ratio": cfg.PART_DISTILLATION.MIN_AREA_RATIO, "min_score": cfg.PART_DISTILLATION.MIN_SCORE, } return ret - + # If dataset is registered with [path_only] flag. def load_annotation(self, path_tuple): dataset_path, fname, ann_name = path_tuple - try: + try: + # print(0, dataset_path, flush=True) # NOTE: old annotations are saved before allocating to cpu, so map to cpu when loading. 
- ann_dict = torch.load(os.path.join(dataset_path, fname, ann_name), "cpu") + ann_dict = torch.load(os.path.join(dataset_path, fname, ann_name)) except: self.logger.info("{} is corrupted.".format(os.path.join(dataset_path, fname, ann_name))) - return - - # filter object size - if ann_dict["object_ratio"] >= self.min_object_area_ratio: + return + + # filter object size + if ann_dict["object_ratio"] > self.min_object_area_ratio: new_dict = {"file_name": ann_dict["file_name"], "image_id": ann_dict["image_id"], "class_code": fname, "height": None, "width": None, - "pseudo_annotations": [], + "pseudo_annotations": [], "gt_object_class": self.class_code_to_class_id[ann_dict["class_code"]], } if ann_dict["part_masks"] is None or len(ann_dict["part_masks"]) == 0: - return + return for i, (lbl, segm) in enumerate(zip(ann_dict["part_labels"], ann_dict["part_masks"])): # filter each part size - if "part_ratios" not in ann_dict or ann_dict["part_ratios"][i] >= self.min_area_ratio: - # filter each part score - if "part_scores" not in ann_dict or ann_dict["part_scores"][i] >= self.min_score: - new_dict["pseudo_annotations"].append({"segmentation": segm["segmentation"], - "category_id": lbl}) - height, width = segm["segmentation"]["size"] - new_dict["height"] = height - new_dict["width"] = width + # if "part_ratios" not in ann_dict or ann_dict["part_ratios"][i] >= self.min_area_ratio: + # # filter each part score + # if "part_scores" not in ann_dict or ann_dict["part_scores"][i] >= self.min_score: + new_dict["pseudo_annotations"].append({"segmentation": segm["segmentation"], + "category_id": lbl}) + height, width = segm["segmentation"]["size"] + new_dict["height"] = height + new_dict["width"] = width if len(new_dict["pseudo_annotations"]) > 0: return new_dict @@ -176,14 +178,14 @@ def __call__(self, _dataset_dict): if isinstance(_dataset_dict, tuple): _dataset_dict = self.load_annotation(_dataset_dict) if _dataset_dict is None: - return + return for _ in range(self.num_repeats): 
dataset_dict = self._forward(_dataset_dict, self.augs) if dataset_dict["instances"].has("gt_masks") \ - and len(dataset_dict["instances"]) > 0: + and len(dataset_dict["instances"]) > 0: - return dataset_dict + return dataset_dict self.logger.info("Max number of repeats for data augmentation has reached.") self.logger.info("Processing with weak augmentation instead ...\n") @@ -196,20 +198,20 @@ def __call__(self, _dataset_dict): def _forward(self, _dataset_dict, aug): dataset_dict = copy.deepcopy(_dataset_dict) # it will be modified by code below - image_orig = utils.read_image(dataset_dict["file_name"], format=self.img_format) - image, _ = T.apply_transform_gens(self.base_aug, image_orig) + image_orig = utils.read_image(dataset_dict["file_name"], format=self.img_format) + image, _ = T.apply_transform_gens(self.base_aug, image_orig) - padding_mask = np.ones(image.shape[:2]) + padding_mask = np.zeros(image.shape[:2]) image, transforms = T.apply_transform_gens(aug, image) - + # the crop transformation has default padding value 0 for segmentation padding_mask = transforms.apply_segmentation(padding_mask) padding_mask = ~ padding_mask.astype(bool) - image_shape = image.shape[:2] # h, w - + image_shape = image.shape[:2] # h, w + dataset_dict["height"] = image.shape[0] dataset_dict["width"] = image.shape[1] - + self._transform_annotations(dataset_dict, transforms, image_shape) # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, @@ -226,21 +228,27 @@ def _forward(self, _dataset_dict, aug): def _transform_annotations(self, dataset_dict, transforms, image_shape): parts_list = dataset_dict["pseudo_annotations"] - # NOTE: We do not use these information for pseudo label, but - # to make the below functions happy we need them. + # NOTE: We do not use these information for pseudo label, but + # to make the below functions happy we need them. # NOTE: set "by_box=False" for filtering empty instances !!! 
for part in parts_list: part["bbox"] = [0, 0, image_shape[0], image_shape[1]] - part["bbox_mode"] = BoxMode.XYXY_ABS + part["bbox_mode"] = BoxMode.XYXY_ABS if "category_id" not in part: part["category_id"] = -1 - + # Get flat list of annotations. annos = [utils.transform_instance_annotations( part, transforms, image_shape, ) for part in parts_list] + + # NOTE: detectron2 pads 255 instead of 0 so make sure padding is correct. + if annos[0]['segmentation'].dtype == np.uint8: + masks = torch.tensor([_['segmentation'] for _ in annos]) + for ann in annos: + ann['segmentation'][ann['segmentation']==255] = 0 # Convert to instances. instances = utils.annotations_to_instances( @@ -249,7 +257,7 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): if hasattr(instances, 'gt_masks'): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = utils.filter_empty_instances(instances, by_box=False) - + new_instances = Instances(instances.image_size) masks = instances.gt_masks.tensor ratio = masks.flatten(1).sum(-1) / masks.sum() @@ -257,4 +265,6 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): new_instances.set("gt_masks", BitMasks(masks[index])) new_instances.set("gt_classes", torch.tensor(instances.gt_classes[index])) - dataset_dict["instances"] = new_instances + dataset_dict["instances"] = new_instances + + diff --git a/part_distillation/data/dataset_mappers/part_imagenet_mapper.py b/part_distillation/data/dataset_mappers/part_imagenet_mapper.py index 98d2457..99aef36 100644 --- a/part_distillation/data/dataset_mappers/part_imagenet_mapper.py +++ b/part_distillation/data/dataset_mappers/part_imagenet_mapper.py @@ -2,9 +2,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ + import copy import logging -import os +import os import numpy as np import torch @@ -26,7 +28,7 @@ def correct_part_imagenet_path(_dataset_dict): iname = _dataset_dict["file_name"].split('/')[-1] path = "/".join(_dataset_dict["file_name"].split('/')[:-1]) path = os.path.join(path, fname, iname) - _dataset_dict["file_name"] = path + _dataset_dict["file_name"] = path _dataset_dict["class_code"] = fname @@ -69,10 +71,10 @@ def __init__( self.num_repeats = 20 # number of repeats until give up. self.use_merged_gt = use_merged_gt self.class_code_to_class_id = class_code_to_class_id - + @classmethod - def from_config(cls, cfg, is_train=True): + def from_config(cls, cfg, dataset_name="part_imagenet_valtest", is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( @@ -81,7 +83,7 @@ def from_config(cls, cfg, is_train=True): cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] - + if is_train: augs.append(T.RandomFlip()) if cfg.INPUT.COLOR_AUG_SSD: @@ -94,10 +96,10 @@ def from_config(cls, cfg, is_train=True): ) ) - # NOTE:This needs to be always from imagenet_1k_train! - class_code_to_class_id = MetadataCatalog.get("imagenet_1k_meta_train").class_code_to_class_id - - # NOTE: Need to convert to the proper vocabulary. + # NOTE:This needs to be always from imagenet_1k_train! + class_code_to_class_id = MetadataCatalog.get(dataset_name).imagenet_1k_class_code_to_class_id + + # NOTE: Need to convert to the proper vocabulary. 
if "22k" in cfg.DATASETS.TRAIN[0]: map_1k_to_22k = torch.load("metadata/imagenet1k_to_22k_mapping.pkl") class_code_to_class_id = {k: map_1k_to_22k[i] for k, i in class_code_to_class_id.items()} @@ -126,15 +128,15 @@ def _forward_with_aug(self, _dataset_dict, aug): self._transform_part_annotations(dataset_dict, transforms, image_shape) self._transform_object_annotations(dataset_dict, transforms, image_shape) - + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) - + if hasattr(dataset_dict["part_instances"], "gt_masks"): return dataset_dict - + def __call__(self, _dataset_dict): @@ -153,8 +155,8 @@ def __call__(self, _dataset_dict): for _ in range(self.num_repeats): dataset_dict = self._forward_with_aug(_dataset_dict, self.aug) if dataset_dict["part_instances"].has("gt_masks") \ - and len(dataset_dict["part_instances"]) > 0: - return dataset_dict + and len(dataset_dict["part_instances"]) > 0: + return dataset_dict return self._forward_with_aug(_dataset_dict, []) else: @@ -162,9 +164,9 @@ def __call__(self, _dataset_dict): - def _transform_part_annotations(self, - dataset_dict: Dict[str, Any], - transforms: Any, + def _transform_part_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): annos = [ utils.transform_instance_annotations(obj, transforms, image_shape, keypoint_hflip_indices=False) @@ -182,7 +184,7 @@ def _transform_part_annotations(self, if self.use_merged_gt: mask_all = instances.gt_masks.tensor label_all = instances.gt_classes - unique_classes = instances.gt_classes.unique() + unique_classes = instances.gt_classes.unique() merged_masks = [] for c in unique_classes: merged_masks.append(mask_all[label_all==c].sum(0)) @@ -196,9 +198,9 @@ def _transform_part_annotations(self, # 
some annotation has no part (will be resampled). dataset_dict["part_instances"] = instances - def _transform_object_annotations(self, - dataset_dict: Dict[str, Any], - transforms: Any, + def _transform_object_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): if hasattr(dataset_dict["part_instances"], "gt_masks"): @@ -209,3 +211,4 @@ def _transform_object_annotations(self, new_instances.set("gt_classes", torch.tensor([self.class_code_to_class_id[class_code]])) dataset_dict["instances"] = new_instances + diff --git a/part_distillation/data/dataset_mappers/proposal_dataset_mapper.py b/part_distillation/data/dataset_mappers/proposal_dataset_mapper.py index e894d9e..de5ab36 100644 --- a/part_distillation/data/dataset_mappers/proposal_dataset_mapper.py +++ b/part_distillation/data/dataset_mappers/proposal_dataset_mapper.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import copy import logging -import os +import os import numpy as np import torch from typing import Tuple, Union, Any, List @@ -16,7 +17,7 @@ from detectron2.data.transforms import TransformGen from detectron2.projects.point_rend import ColorAugSSDTransform from detectron2.structures import BitMasks, Instances, BoxMode -import copy +import copy __all__ = ["ProposalDatasetMapper"] @@ -43,20 +44,20 @@ def __init__( self.min_object_area_ratio = min_object_area_ratio self.min_area_ratio = min_area_ratio self.class_code_to_class_id = class_code_to_class_id - self.logger = logging.getLogger("part_distillation") - + self.logger = logging.getLogger("part_distillation") + @classmethod def from_config(cls, cfg, is_train=True, base_size=-1): image_size = cfg.INPUT.IMAGE_SIZE - aug_name_list = cfg.CUSTOM_DATASETS.AUG_NAME_LIST + aug_name_list = cfg.CUSTOM_DATASETS.AUG_NAME_LIST class_code_to_class_id = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).class_code_to_class_id - + base_aug = [] if base_size > 0: - # Base size if pseudo-labels are done after resizing. - base_aug.append(T.ResizeScale(min_scale=1.0, - max_scale=1.0, - target_height=base_size, + # Base size if pseudo-labels are done after resizing. + base_aug.append(T.ResizeScale(min_scale=1.0, + max_scale=1.0, + target_height=base_size, target_width=base_size)) augs, weak_augs = [], [] @@ -75,23 +76,23 @@ def from_config(cls, cfg, is_train=True, base_size=-1): if "scale" in aug_name_list: min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE - augs.extend([T.ResizeScale(min_scale=min_scale, - max_scale=max_scale, - target_height=image_size, + augs.extend([T.ResizeScale(min_scale=min_scale, + max_scale=max_scale, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) else: - # No resizing but pad to make it square shape. - augs.extend([T.ResizeScale(min_scale=1.0, - max_scale=1.0, - target_height=image_size, + # No resizing but pad to make it square shape. 
+ augs.extend([T.ResizeScale(min_scale=1.0, + max_scale=1.0, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) - weak_augs.extend([T.ResizeScale(min_scale=1.0, - max_scale=1.0, - target_height=image_size, + weak_augs.extend([T.ResizeScale(min_scale=1.0, + max_scale=1.0, + target_height=image_size, target_width=image_size), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) @@ -108,36 +109,36 @@ def from_config(cls, cfg, is_train=True, base_size=-1): return ret - + # If dataset is registered with [path_only] flag. - def load_annotation(self, path_tuple): - dataset_path, fname, ann_name = path_tuple + def load_annotation(self, dataset_path, fname, ann_name): try: ann_dict = torch.load(os.path.join(dataset_path, fname, ann_name)) - except EOFError: + except: self.logger.info(os.path.join(dataset_path, fname, ann_name), " is corrupted.") - return + return + if ann_dict["object_ratio"] > self.min_object_area_ratio: new_dict = {"file_name": ann_dict["file_path"], "image_id": ann_dict["file_name"], "class_code": fname, "height": None, "width": None, - "pseudo_annotations": [], + "pseudo_annotations": [], "gt_object_class": self.class_code_to_class_id[ann_dict["class_code"]], } if ann_dict["part_mask"] is None or len(ann_dict["part_mask"]) == 0: - return + return for segm in ann_dict["part_mask"]: new_dict["pseudo_annotations"].append({"segmentation": segm["segmentation"], - "category_id": 0}) # class-agnostic -> postive = 0. + "category_id": 0}) # class-agnostic -> postive = 0. 
height, width = segm["segmentation"]["size"] - new_dict["height"] = height + new_dict["height"] = height new_dict["width"] = width if len(new_dict["pseudo_annotations"]) > 0: return new_dict - + def __call__(self, _dataset_dict): @@ -149,16 +150,17 @@ def __call__(self, _dataset_dict): dict: a format that builtin models in detectron2 accept """ if isinstance(_dataset_dict, tuple): - _dataset_dict = self.load_annotation(_dataset_dict) + dataset_path, fname, ann_name = _dataset_dict + _dataset_dict = self.load_annotation(dataset_path, fname, ann_name) if _dataset_dict is None: - return + return for _ in range(self.num_repeats): dataset_dict = self._forward(_dataset_dict, self.augs) if dataset_dict["instances"].has("gt_masks") \ - and len(dataset_dict["instances"]) > 0: + and len(dataset_dict["instances"]) > 0: - return dataset_dict + return dataset_dict self.logger.info("Max number of repeats for data augmentation has reached.") self.logger.info("Processing with weak augmentation instead ...\n") dataset_dict = self._forward(_dataset_dict, self.weak_augs) @@ -170,18 +172,18 @@ def __call__(self, _dataset_dict): def _forward(self, _dataset_dict, aug): dataset_dict = copy.deepcopy(_dataset_dict) # it will be modified by code below - image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) if len(self.base_aug) > 0: image, _ = T.apply_transform_gens(self.base_aug, image) utils.check_image_size(dataset_dict, image) - - padding_mask = np.ones(image.shape[:2]) + # print(image.shape, flush=True) + padding_mask = np.zeros(image.shape[:2]) image, transforms = T.apply_transform_gens(aug, image) padding_mask = transforms.apply_segmentation(padding_mask) + # print(padding_mask.astype(bool).sum(), (~padding_mask.astype(bool)).sum(), flush=True) padding_mask = ~ padding_mask.astype(bool) image_shape = image.shape[:2] # h, w - - # For visualization. + # For visualization. 
dataset_dict["height"] = image_shape[0] dataset_dict["width"] = image_shape[1] @@ -201,26 +203,33 @@ def _forward(self, _dataset_dict, aug): def _transform_annotations(self, dataset_dict, transforms, image_shape): parts_list = dataset_dict["pseudo_annotations"] - # NOTE: We do not use these information for pseudo label, but - # to make the below functions happy we need them. + # NOTE: We do not use these information for pseudo label, but + # to make the below functions happy we need them. # NOTE: set "by_box=False" for filtering empty instances !!! for part in parts_list: part["bbox"] = [0, 0, image_shape[0], image_shape[1]] - part["bbox_mode"] = BoxMode.XYXY_ABS + part["bbox_mode"] = BoxMode.XYXY_ABS if "category_id" not in part: part["category_id"] = -1 - + # Get flat list of annotations. annos = [utils.transform_instance_annotations( part, transforms, image_shape, ) for part in parts_list] + + # NOTE: detectron2 pads 255 instead of 0 so make sure padding is correct. + if annos[0]['segmentation'].dtype == np.uint8: + masks = torch.tensor([_['segmentation'] for _ in annos]) + for ann in annos: + ann['segmentation'][ann['segmentation']==255] = 0 # Convert to instances. 
instances = utils.annotations_to_instances( annos, image_shape, mask_format=self.instance_mask_format ) + # print(640*640, instances.gt_masks.tensor.sum(), flush=True) if hasattr(instances, 'gt_masks'): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = utils.filter_empty_instances(instances, by_box=False) @@ -231,5 +240,6 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): index = ratio > self.min_area_ratio new_instances.set("gt_masks", BitMasks(masks[index])) new_instances.set("gt_classes", torch.tensor(instances.gt_classes[index])) + + dataset_dict["instances"] = new_instances - dataset_dict["instances"] = new_instances diff --git a/part_distillation/data/dataset_mappers/proposal_generation_mapper.py b/part_distillation/data/dataset_mappers/proposal_generation_mapper.py index 621bf7a..8c7d530 100644 --- a/part_distillation/data/dataset_mappers/proposal_generation_mapper.py +++ b/part_distillation/data/dataset_mappers/proposal_generation_mapper.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import copy import logging -import os +import os import numpy as np import torch from detectron2.config import configurable @@ -27,8 +28,8 @@ def __init__( self.aug = augmentations self.img_format = image_format self.with_given_mask = with_given_mask - self.logger = logging.getLogger("part_distillation") - + self.logger = logging.getLogger("part_distillation") + @classmethod def from_config(cls, cfg): # Build augmentation @@ -48,10 +49,7 @@ def from_config(cls, cfg): return ret def __call__(self, dataset_dict): - try: - image_original = utils.read_image(dataset_dict["file_path"], format=self.img_format) - except: - return + image_original = utils.read_image(dataset_dict["file_path"], format=self.img_format) utils.check_image_size(dataset_dict, image_original) image, _ = T.apply_transform_gens(self.aug, image_original) @@ -65,29 +63,33 @@ def __call__(self, dataset_dict): dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if self.with_given_mask: - self._transform_annotations(dataset_dict, [], image_shape) + pseudo_annotations = torch.load(dataset_dict['object_mask_path']) + if len(pseudo_annotations["object_masks"]) > 0: + dataset_dict["pseudo_annotations"] = [{"segmentation" : pseudo_annotations["object_masks"][0]["segmentation"]}] + self._transform_annotations(dataset_dict, [], image_shape) + else: + self.logger.info("No object mask detected on {}.".format(dataset_dict['object_mask_path'])) + return None if not dataset_dict["instances"].has("gt_masks") or len(dataset_dict["instances"]) == 0: self.logger.info("No mask detected on {}.".format(dataset_dict["file_path"])) - return None - else: - return dataset_dict - else: - return dataset_dict - - + return None + return dataset_dict + + + def _transform_annotations(self, dataset_dict, transforms, image_shape): object_list = dataset_dict["pseudo_annotations"] - # NOTE: We do not use these information for pseudo label, but - # to make the below functions happy we need them. 
+ # NOTE: We do not use these information for pseudo label, but + # to make the below functions happy we need them. # NOTE: set "by_box=False" for filtering empty instances !!! for obj in object_list: obj["bbox"] = [0, 0, image_shape[0], image_shape[1]] - obj["bbox_mode"] = BoxMode.XYXY_ABS + obj["bbox_mode"] = BoxMode.XYXY_ABS if "category_id" not in obj: obj["category_id"] = -1 - + # Get flat list of annotations. annos = [utils.transform_instance_annotations( obj, @@ -102,5 +104,8 @@ def _transform_annotations(self, dataset_dict, transforms, image_shape): if hasattr(instances, 'gt_masks'): instances.gt_boxes = instances.gt_masks.get_bounding_boxes() instances = utils.filter_empty_instances(instances, by_box=False) + + dataset_dict["instances"] = instances + + - dataset_dict["instances"] = instances diff --git a/part_distillation/data/dataset_mappers/voc_parts_mapper.py b/part_distillation/data/dataset_mappers/voc_parts_mapper.py index 72cedea..e319c42 100644 --- a/part_distillation/data/dataset_mappers/voc_parts_mapper.py +++ b/part_distillation/data/dataset_mappers/voc_parts_mapper.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import copy import logging -import os +import os import numpy as np import torch from typing import Tuple, Union, Any, List, Dict @@ -29,7 +30,7 @@ def __init__( aug_without_crop, image_format, size_divisibility, - instance_mask_format: str = "bitmask", + instance_mask_format: str = "bitmask", use_merged_gt: bool=False, ): self.is_train = is_train @@ -40,7 +41,7 @@ def __init__( self.instance_mask_format = instance_mask_format self.num_repeats = 100 # number of repeats until give up. 
self.use_merged_gt = use_merged_gt - + @classmethod def from_config(cls, cfg, is_train=True): augs_without_crop = [] @@ -80,6 +81,8 @@ def _forward_with_aug(self, _dataset_dict, aug): image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) + # print(len(dataset_dict['annotations']), flush=True) + aug_input = T.AugInput(image) aug_input, transforms = T.apply_transform_gens(aug, aug_input) image = aug_input.image @@ -96,9 +99,9 @@ def _forward_with_aug(self, _dataset_dict, aug): dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) del dataset_dict["annotations"] del dataset_dict["part_annotations"] - + return dataset_dict - + def __call__(self, _dataset_dict): @@ -113,24 +116,24 @@ def __call__(self, _dataset_dict): for _ in range(self.num_repeats): dataset_dict = self._forward_with_aug(_dataset_dict, self.aug) if dataset_dict["part_instances"].has("gt_masks") \ - and dataset_dict["part_instances"].gt_masks.tensor.shape[0] > 1: - return dataset_dict + and dataset_dict["part_instances"].gt_masks.tensor.shape[0] > 1: + return dataset_dict return self._forward_with_aug(_dataset_dict, self.aug_without_crop) else: return self._forward_with_aug(_dataset_dict, self.aug) - def _transform_annotations(self, - dataset_dict: Dict[str, Any], - transforms: Any, + def _transform_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): annos = [ utils.transform_instance_annotations(obj, transforms, image_shape, keypoint_hflip_indices=False) for obj in dataset_dict["annotations"] if obj.get("iscrowd", 0) == 0 ] - instances = utils.annotations_to_instances(annos, image_shape, + instances = utils.annotations_to_instances(annos, image_shape, mask_format=self.instance_mask_format) obj_mapping = [obj_id for obj_id, obj in enumerate(dataset_dict["annotations"])] instances.obj_mapping = torch.tensor(obj_mapping, dtype=torch.int64) @@ -138,9 +141,9 @@ def 
_transform_annotations(self, dataset_dict["instances"] = utils.filter_empty_instances(instances) - def _transform_part_annotations(self, - dataset_dict: Dict[str, Any], - transforms: Any, + def _transform_part_annotations(self, + dataset_dict: Dict[str, Any], + transforms: Any, image_shape: Tuple): parts_list = [ part_ann @@ -174,7 +177,7 @@ def _transform_part_annotations(self, [i for i, _ in enumerate(flat_part_segs)], dtype=torch.int64 ) instances = utils.filter_empty_instances(instances) - + # save original part masks for evaluation dataset_dict["orig_part_maps"] = [ parts diff --git a/part_distillation/data/datasets/pascal_info.py b/part_distillation/data/datasets/pascal_info.py index 2d887a0..ae7a8d9 100644 --- a/part_distillation/data/datasets/pascal_info.py +++ b/part_distillation/data/datasets/pascal_info.py @@ -1,8 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - """ Contains information about the pascal parts categories. http://roozbehm.info/pascal-parts/pascal-parts.html diff --git a/part_distillation/data/datasets/pascal_voc.py b/part_distillation/data/datasets/pascal_voc.py new file mode 100644 index 0000000..4e8fdb0 --- /dev/null +++ b/part_distillation/data/datasets/pascal_voc.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +import numpy as np +import os +import xml.etree.ElementTree as ET +from typing import List, Tuple, Union + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import BoxMode +from detectron2.utils.file_io import PathManager + +__all__ = ["load_voc_instances", "register_pascal_voc"] + + +# fmt: off +CLASS_NAMES = ( + "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", + "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train", "tvmonitor" +) +# fmt: on + + +def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): + """ + Load Pascal VOC detection annotations to Detectron2 format. + + Args: + dirname: Contain "Annotations", "ImageSets", "JPEGImages" + split (str): one of "train", "test", "val", "trainval" + class_names: list or tuple of class names + """ + with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f: + # fileids = np.loadtxt(f, dtype=np.str) + fileids = np.loadtxt(f, dtype=str) + + # Needs to read many small annotation files. Makes sense at local + annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/")) + dicts = [] + for fileid in fileids: + anno_file = os.path.join(annotation_dirname, fileid + ".xml") + jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg") + + with PathManager.open(anno_file) as f: + tree = ET.parse(f) + + r = { + "file_name": jpeg_file, + "image_id": fileid, + "height": int(tree.findall("./size/height")[0].text), + "width": int(tree.findall("./size/width")[0].text), + } + instances = [] + + for obj in tree.findall("object"): + cls = obj.find("name").text + # We include "difficult" samples in training. + # Based on limited experiments, they don't hurt accuracy. 
+ # difficult = int(obj.find("difficult").text) + # if difficult == 1: + # continue + bbox = obj.find("bndbox") + bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] + # Original annotations are integers in the range [1, W or H] + # Assuming they mean 1-based pixel indices (inclusive), + # a box with annotation (xmin=1, xmax=W) covers the whole image. + # In coordinate space this is represented by (xmin=0, xmax=W) + bbox[0] -= 1.0 + bbox[1] -= 1.0 + instances.append( + {"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS} + ) + r["annotations"] = instances + dicts.append(r) + return dicts + + +def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES): + DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names)) + MetadataCatalog.get(name).set( + thing_classes=list(class_names), dirname=dirname, year=year, split=split + ) \ No newline at end of file diff --git a/part_distillation/data/datasets/register_cityscapes_part.py b/part_distillation/data/datasets/register_cityscapes_part.py index dcfb31a..f6461b8 100644 --- a/part_distillation/data/datasets/register_cityscapes_part.py +++ b/part_distillation/data/datasets/register_cityscapes_part.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import logging import os -import copy +import copy import numpy as np import pycocotools.mask as mask_util import panoptic_parts as pp @@ -15,7 +16,7 @@ from detectron2.data.datasets.cityscapes import load_cityscapes_instances from detectron2.data import detection_utils as utils from detectron2.utils.file_io import PathManager -from PIL import Image +from PIL import Image CITYSCAPES_DATASET_ROOT = "datasets/cityscapes_part/" CITYSCAPES_DATASET_IMAGES = CITYSCAPES_DATASET_ROOT + "leftImg8bit/" @@ -36,19 +37,19 @@ def load_object_and_parts(dict, file_path): """ - Object classes: 24, 25, 26, 27, 28 (5 classes). - - Object class starts from 24 and ends with 28. - Part classes: 15 + 8 = 23. - - Part label starts from 1, and ends with either 4 or 5. - - -1 is ignore, and 0 is unlabeled/void. + Object classes: 24, 25, 26, 27, 28 (5 classes). + - Object class starts from 24 and ends with 28. + Part classes: 15 + 8 = 23. + - Part label starts from 1, and ends with either 4 or 5. + - -1 is ignore, and 0 is unlabeled/void. """ anns_size = (dict["height"], dict["width"]) - instances = utils.annotations_to_instances(dict["annotations"], - anns_size, + instances = utils.annotations_to_instances(dict["annotations"], + anns_size, mask_format="bitmask") - + object_instances = [] - part_instances = [] + part_instances = [] if hasattr(instances, "gt_masks"): obj_classes = instances.gt_classes[instances.gt_classes < 5] obj_masks = instances.gt_masks.tensor[instances.gt_classes < 5].numpy() @@ -70,21 +71,21 @@ def load_object_and_parts(dict, file_path): part_instances_per_object = [] for _pid in np.unique(part_map): # ignore -1 and 0. - if _pid > 0: - part_id = PART_BASE_ID[object_category_id] + _pid-1 # shifting to make it 0 start. + if _pid > 0: + part_id = PART_BASE_ID[object_category_id] + _pid-1 # shifting to make it 0 start. 
part_dict = {"part_category": PART_CLASSES[part_id], - "part_category_id": part_id, + "part_category_id": part_id, "category_id": part_id, # For histogram printing. "object_index": instance_id, "segmentation": mask_util.encode(np.asfortranarray(np.where(part_map==_pid, True, False))), } part_instances_per_object.append(part_dict) - + # some object has no parts. if len(part_dict) > 0: object_instances.append(object_dict) part_instances.append(part_instances_per_object) - + return object_instances, part_instances @@ -94,17 +95,17 @@ def load_cityscapes_object_part_instances( annotations_dirname: str, split: str, path_only: bool=False, - label_percentage: int=100, - for_segmentation: bool=False, + label_percentage: int=100, + for_segmentation: bool=False, debug: bool=False, -): +): logger = logging.getLogger("part_distillation") logger.info("Starting loading cityscapes part data") - + if len(images_dirname) == 0: - images_dirname = CITYSCAPES_DATASET_IMAGES + images_dirname = CITYSCAPES_DATASET_IMAGES if len(annotations_dirname) == 0: - annotations_dirname = CITYSCAPES_DATASET_OBJ_ANNS + annotations_dirname = CITYSCAPES_DATASET_OBJ_ANNS original_dicts = load_cityscapes_instances(images_dirname + split, annotations_dirname + split) if label_percentage < 100: @@ -115,7 +116,7 @@ def load_cityscapes_object_part_instances( threshold = int(len(original_dicts) * label_percentage / 100) original_dicts = original_dicts[:threshold] logger.info("{} original dicts used.".format(len(original_dicts))) - + dict_list = [] for dict in original_dicts: city_name = dict["image_id"].split("_")[0] @@ -127,7 +128,7 @@ def load_cityscapes_object_part_instances( else: object_instances, part_instances = load_object_and_parts(dict, part_file) if for_segmentation: - # for segmentation, each instance is saved in a separate dict. + # for segmentation, each instance is saved in a separate dict. 
if len(part_instances) > 0: for object_annotation, part_annotations in zip(object_instances, part_instances): if len(part_annotations) > 0: @@ -137,14 +138,14 @@ def load_cityscapes_object_part_instances( dict_list.append(new_dict) else: if len(part_instances) > 0: - dict["annotations"] = object_instances + dict["annotations"] = object_instances dict["part_annotations"] = part_instances dict_list.append(dict) logger.info("{} annotation dicts registered in total.".format(len(dict_list))) if debug and len(dict_list) > 10: - return dict_list - + return dict_list + return dict_list @@ -155,16 +156,16 @@ def register_cityscapes_part(name: str, path_only=False, label_percentage: int=100, for_segmentation: bool=False, - debug=False, - ): + debug=False, + ): DatasetCatalog.register( name, lambda: load_cityscapes_object_part_instances( images_dirname, annotations_dirname, - split, + split, path_only=path_only, - label_percentage=label_percentage, + label_percentage=label_percentage, for_segmentation=for_segmentation, debug=debug, ), @@ -175,3 +176,5 @@ def register_cityscapes_part(name: str, classes=PART_CLASSES, split=split, ) + + diff --git a/part_distillation/data/datasets/register_imagenet.py b/part_distillation/data/datasets/register_imagenet.py index 2630f2e..104c5af 100644 --- a/part_distillation/data/datasets/register_imagenet.py +++ b/part_distillation/data/datasets/register_imagenet.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import os import logging -import torch +import torch from typing import List from detectron2.data import DatasetCatalog, MetadataCatalog @@ -15,24 +16,25 @@ PART_IMAGENET_CLASSES_TRAIN = os.listdir("datasets/part_imagenet/train") PART_IMAGENET_CLASSES_VAL = os.listdir("datasets/part_imagenet/val") PART_IMAGENET_CLASSES_TEST = os.listdir("datasets/part_imagenet/test") +METADATA_PATH = "datasets/metadata/" - -def load_imagenet_images(fname_to_cname_dict, - dataset_path, split, - class_code_to_class_id, - save_path, - with_given_mask=False, +def load_imagenet_images(fname_to_cname_dict, + dataset_path, split, + class_code_to_class_id, + save_path, + with_given_mask=False, object_mask_path="", debug=False): logger = logging.getLogger("part_distillation") logger.info("Starting loading imagenet data.") - + dict_list = [] done_already = 0 total_num = 0 filename_list = [fname for fname in fname_to_cname_dict.keys() if fname in os.listdir(dataset_path)] if debug: filename_list = filename_list[:100] + for fname in filename_list: image_list = os.listdir(os.path.join(dataset_path, fname)) if debug: @@ -45,20 +47,15 @@ def load_imagenet_images(fname_to_cname_dict, "class_code": fname, "gt_object_class": class_code_to_class_id[fname], "class_name": fname_to_cname_dict[fname]} - if with_given_mask: if os.path.exists(os.path.join(object_mask_path, fname, iname)): - object_data = torch.load(os.path.join(object_mask_path, fname, iname)) - if len(object_data["object_masks"]) > 0: - # object masks are ordered by confidence already (use most confident mask). - data["pseudo_annotations"] = [{"segmentation" : object_data["object_masks"][0]["segmentation"]}] - dict_list.append(data) + data['object_mask_path'] = os.path.join(object_mask_path, fname, iname) + dict_list.append(data) else: dict_list.append(data) else: done_already += 1 - logger.info("Progress: {}/{} ({} to go!)".format(done_already, total_num, len(dict_list))) - + logger.info("Loading imagenet done. 
(loaded: {}/ done before: {}/ total: {})".format(len(dict_list), done_already, total_num)) return dict_list @@ -67,7 +64,7 @@ def register_imagenet( name: str, split: str, partitioned_imagenet: bool=True, - total_partitions: int=10, + total_partitions: int=10, partition_index: int=0, save_path: str="", with_given_mask:bool=False, @@ -76,16 +73,14 @@ def register_imagenet( exclude_code_path: str="", single_class_code: str="", use_part_imagenet_classes: bool=False, - debug=False, -): + debug=False, +): logger = logging.getLogger("part_distillation") logger.info("Start registering imagenet dataset.") if "1k" in name: imagenet_size = "1k" dataset_path = IMAGENET_1K_DATASET_PATH + "train" - with open(os.path.join(IMAGENET_1K_DATASET_PATH, "labels.txt"), "r") as f: - fname_cname_pair_list = f.readlines() - fname_to_classname = {x.split(',')[0]: x.split(',')[1].strip() for x in fname_cname_pair_list} + fname_to_classname = torch.load(os.path.join(METADATA_PATH, 'imagenet_1k_fname_classname_dict.pkl')) elif "22k" in name: imagenet_size = "22k" dataset_path = IMAGENET_22K_DATASET_PATH @@ -96,10 +91,10 @@ def register_imagenet( fname_cname_pair_list = f.readlines() fname_to_classname = {x.split('\t')[0]: x.split('\t')[1].strip() for x in fname_cname_pair_list} fname_to_classname = {k:v for k, v in fname_to_classname.items() if k in class_code_list} - elif use_part_imagenet_classes: + elif use_part_imagenet_classes: PART_IMAGENET_CLASSES = [] if "val" in split: - PART_IMAGENET_CLASSES += PART_IMAGENET_CLASSES_VAL + PART_IMAGENET_CLASSES += PART_IMAGENET_CLASSES_VAL if "train" in split: PART_IMAGENET_CLASSES += PART_IMAGENET_CLASSES_TRAIN if "test" in split: @@ -109,7 +104,7 @@ def register_imagenet( else: raise ValueError("{} not supported.".format(name)) - # Use subset classes. + # Use subset classes. 
for filtered_code_path in filtered_code_path_list: if len(filtered_code_path) > 0: filtered_code_list = torch.load(filtered_code_path) @@ -118,37 +113,39 @@ def register_imagenet( fname_to_classname = {k: v for k, v in fname_to_classname.items() if k == single_class_code} if len(exclude_code_path) > 0: exclude_code_list = torch.load(exclude_code_path) - fname_to_classname = {k:v for k, v in fname_to_classname.items() if k not in exclude_code_list} - class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} - + fname_to_classname = {k: v for k, v in fname_to_classname.items() if k not in exclude_code_list} + class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} + key_list_all = list(fname_to_classname.keys()) if partitioned_imagenet: - # Parallelize the preprocessing. + # Parallelize the preprocessing. partition_size = len(key_list_all) // total_partitions - start_i = partition_index * partition_size + start_i = partition_index * partition_size end_i = (partition_index+1) * partition_size if partition_index + 1 < total_partitions else len(list(fname_to_classname.keys())) key_list = list(key_list_all)[start_i: end_i] - fname_to_classname = {k: fname_to_classname[k] for k in key_list} - logger.info("{}/{} classes used.".format(len(fname_to_classname), len(key_list_all))) - + fname_to_classname_local = {k: fname_to_classname[k] for k in key_list} + else: + fname_to_classname_local = fname_to_classname + logger.info("{}/{} classes used.".format(len(fname_to_classname_local), len(key_list_all))) + DatasetCatalog.register( name, lambda: load_imagenet_images( - fname_to_cname_dict=fname_to_classname, + fname_to_cname_dict=fname_to_classname_local, dataset_path=dataset_path, split=split, class_code_to_class_id=class_code_to_class_id, - save_path=save_path, - with_given_mask=with_given_mask, + save_path=save_path, + with_given_mask=with_given_mask, object_mask_path=object_mask_path, debug=debug, ), ) 
MetadataCatalog.get(name).set( - classes=list(fname_to_classname.values()), - class_codes=list(fname_to_classname.keys()), - fname_to_classname=fname_to_classname, + classes=list(fname_to_classname.values()), # used together with [class_code_to_class_id] which is global. + class_codes=list(fname_to_classname_local.keys()), + fname_to_classname=fname_to_classname_local, class_code_to_class_id=class_code_to_class_id, save_path=save_path, split=split, diff --git a/part_distillation/data/datasets/register_imagenet_with_proposals.py b/part_distillation/data/datasets/register_imagenet_with_proposals.py index 4942426..cc8c3a4 100644 --- a/part_distillation/data/datasets/register_imagenet_with_proposals.py +++ b/part_distillation/data/datasets/register_imagenet_with_proposals.py @@ -3,16 +3,17 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import os import logging -import torch +import torch from detectron2.data import DatasetCatalog, MetadataCatalog from typing import List IMAGENET_1K_DATASET_PATH = "datasets/imagenet_1k/" IMAGENET_22K_DATASET_PATH = "datasets/imagenet_22k/" EXCLUDE_CODE_PATH = "datasets/metadata/exclude_code_list.pkl" - +METADATA_PATH = "datasets/metadata/" def load_multiple_imagenet_images(filename_list, dataset_path_list, min_object_area_ratio, class_code_to_class_id, path_only=False, debug=False): logger = logging.getLogger("part_distillation") @@ -33,7 +34,7 @@ def load_imagenet_images(filename_list, dataset_path, min_object_area_ratio, cla logger.info("Start loading imagenet data images and proposals from {}.".format(dataset_path)) dict_list = [] - count = 0 + count = 0 used = 0 filename_list = [fname for fname in filename_list if fname in os.listdir(dataset_path)] if debug: @@ -54,7 +55,7 @@ def load_imagenet_images(filename_list, dataset_path, min_object_area_ratio, cla ann_dict = torch.load(ann_path) except EOFError: print(ann_path, " is corrupted.", flush=True) - 
continue + continue if ann_dict["object_ratio"] > min_object_area_ratio: new_dict = {"file_name": ann_dict["file_path"], "image_id": ann_dict["file_name"], @@ -64,11 +65,11 @@ def load_imagenet_images(filename_list, dataset_path, min_object_area_ratio, cla "width": None, "pseudo_annotations": []} if ann_dict["part_mask"] is None: - continue + continue for segm in ann_dict["part_mask"]: new_dict["pseudo_annotations"].append({"segmentation": segm["segmentation"]}) height, width = segm["segmentation"]["size"] - new_dict["height"] = height + new_dict["height"] = height new_dict["width"] = width if len(new_dict["pseudo_annotations"]) > 0: dict_list.append(new_dict) @@ -82,23 +83,23 @@ def register_imagenet_with_proposals( name: str, dataset_path: str, split: str, - min_object_area_ratio: float=-1.0, + min_object_area_ratio: float=-1.0, partitioned_imagenet: bool=False, - total_partitions: int=10, + total_partitions: int=10, partition_index: int=0, dataset_path_list=[], filtered_code_path_list: List[str]=[""], exclude_code_path: str="", single_class_code: str="", path_only: bool=False, - debug=False, -): + debug=False, +): logger = logging.getLogger("part_distillation") logger.info("Start registering imagenet with proposals.") if "1k" in name: - with open(os.path.join(IMAGENET_1K_DATASET_PATH, "labels.txt"), "r") as f: - fname_cname_pair_list = f.readlines() - fname_to_classname = {x.split(',')[0]: x.split(',')[1].strip() for x in fname_cname_pair_list} + imagenet_size = "1k" + dataset_path = IMAGENET_1K_DATASET_PATH + "train" + fname_to_classname = torch.load(os.path.join(METADATA_PATH, 'imagenet_1k_fname_classname_dict.pkl')) elif "22k" in name: with open(os.path.join(IMAGENET_22K_DATASET_PATH, "synsets.dat"), "r") as f: class_code_list = f.readlines() @@ -107,8 +108,8 @@ def register_imagenet_with_proposals( fname_cname_pair_list = f.readlines() fname_to_classname = {x.split('\t')[0]: x.split('\t')[1].strip() for x in fname_cname_pair_list} fname_to_classname = {k:v 
for k, v in fname_to_classname.items() if k in class_code_list} - - # Use subset classes. + + # Use subset classes. for filtered_code_path in filtered_code_path_list: if len(filtered_code_path) > 0: filtered_code_list = torch.load(filtered_code_path) @@ -118,13 +119,13 @@ def register_imagenet_with_proposals( if len(exclude_code_path) > 0: exclude_code_list = torch.load(exclude_code_path) fname_to_classname = {k:v for k, v in fname_to_classname.items() if k not in exclude_code_list} - class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} + class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} key_list_all = list(fname_to_classname.keys()) if partitioned_imagenet: - # Parallelize the preprocessing. + # Parallelize the preprocessing. partition_size = len(key_list_all) // total_partitions - start_i = partition_index * partition_size + start_i = partition_index * partition_size end_i = (partition_index+1) * partition_size if partition_index + 1 < total_partitions else len(list(fname_to_classname.keys())) key_list = list(key_list_all)[start_i: end_i] fname_to_classname = {k: fname_to_classname[k] for k in key_list} @@ -132,7 +133,7 @@ def register_imagenet_with_proposals( if len(dataset_path_list) == 0: dataset_path_list = [dataset_path] - + DatasetCatalog.register( name, lambda: load_multiple_imagenet_images( @@ -152,3 +153,4 @@ def register_imagenet_with_proposals( class_code_to_class_id=class_code_to_class_id, split=split, ) + diff --git a/part_distillation/data/datasets/register_imagenet_with_segmentation.py b/part_distillation/data/datasets/register_imagenet_with_segmentation.py index abd4937..dbace46 100644 --- a/part_distillation/data/datasets/register_imagenet_with_segmentation.py +++ b/part_distillation/data/datasets/register_imagenet_with_segmentation.py @@ -3,16 +3,17 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os -import logging -import torch + +import os +import logging +import torch from detectron2.data import DatasetCatalog, MetadataCatalog from typing import List IMAGENET_1K_DATASET_PATH = "datasets/imagenet_1k/" IMAGENET_22K_DATASET_PATH = "datasets/imagenet_22k/" -EXCLUDE_CODE_PATH = "datasets/metadata/imagenet_exclude_code_list.pkl" - +EXCLUDE_CODE_PATH = "datasets/metadata/exclude_code_list.pkl" +METADATA_PATH = "datasets/metadata/" def load_multiple_imagenet_images(filename_list, dataset_path_list, class_code_to_class_id, path_only=False, debug=False): logger = logging.getLogger("part_distillation") @@ -33,7 +34,7 @@ def load_imagenet_images(filename_list, dataset_path, class_code_to_class_id, pa logger.info("Start loading imagenet data path from {}.".format(dataset_path)) dict_list = [] - count = 0 + count = 0 used = 0 filename_list = [fname for fname in filename_list if fname in os.listdir(dataset_path)] if debug: @@ -59,21 +60,21 @@ def register_imagenet_with_segmentation( dataset_path: str, split: str, partitioned_imagenet: bool=False, - total_partitions: int=10, + total_partitions: int=10, partition_index: int=0, dataset_path_list=[], filtered_code_path_list: List[str]=[""], exclude_code_path: str="", single_class_code: str="", path_only: bool=False, - debug=False, + debug=False, ): logger = logging.getLogger("part_distillation") logger.info("Start registering imagenet with segmentation.") if "1k" in name: - with open(os.path.join(IMAGENET_1K_DATASET_PATH, "labels.txt"), "r") as f: - fname_cname_pair_list = f.readlines() - fname_to_classname = {x.split(',')[0]: x.split(',')[1].strip() for x in fname_cname_pair_list} + imagenet_size = "1k" + dataset_path = IMAGENET_1K_DATASET_PATH + "train" + fname_to_classname = torch.load(os.path.join(METADATA_PATH, 'imagenet_1k_fname_classname_dict.pkl')) elif "22k" in name: with open(os.path.join(IMAGENET_22K_DATASET_PATH, "synsets.dat"), "r") as f: class_code_list = f.readlines() @@ -82,8 +83,8 @@ def 
register_imagenet_with_segmentation( fname_cname_pair_list = f.readlines() fname_to_classname = {x.split('\t')[0]: x.split('\t')[1].strip() for x in fname_cname_pair_list} fname_to_classname = {k:v for k, v in fname_to_classname.items() if k in class_code_list} - - # Use subset classes. + + # Use subset classes. for filtered_code_path in filtered_code_path_list: if len(filtered_code_path) > 0: filtered_code_list = torch.load(filtered_code_path) @@ -93,13 +94,13 @@ def register_imagenet_with_segmentation( if len(exclude_code_path) > 0: exclude_code_list = torch.load(exclude_code_path) fname_to_classname = {k:v for k, v in fname_to_classname.items() if k not in exclude_code_list} - class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} + class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} key_list_all = list(fname_to_classname.keys()) if partitioned_imagenet: - # Parallelize the preprocessing. + # Parallelize the preprocessing. 
partition_size = len(key_list_all) // total_partitions - start_i = partition_index * partition_size + start_i = partition_index * partition_size end_i = (partition_index+1) * partition_size if partition_index + 1 < total_partitions else len(list(fname_to_classname.keys())) key_list = list(key_list_all)[start_i: end_i] fname_to_classname = {k: fname_to_classname[k] for k in key_list} @@ -107,7 +108,7 @@ def register_imagenet_with_segmentation( if len(dataset_path_list) == 0: dataset_path_list = [dataset_path] - + DatasetCatalog.register( name, lambda: load_multiple_imagenet_images( diff --git a/part_distillation/data/datasets/register_part_imagenet.py b/part_distillation/data/datasets/register_part_imagenet.py index 8d0635b..8fc42e6 100644 --- a/part_distillation/data/datasets/register_part_imagenet.py +++ b/part_distillation/data/datasets/register_part_imagenet.py @@ -3,8 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import logging import os +import torch import numpy as np from detectron2.data import DatasetCatalog, MetadataCatalog from detectron2.data.datasets.coco import load_coco_json @@ -34,20 +36,25 @@ def register_part_imagenet(name, annotations_dirname, split, label_percentage: int=100, - debug=False, - ): + debug=False, + ): assert isinstance(name, str), name assert isinstance(images_dirname, str), images_dirname assert isinstance(annotations_dirname, str), annotations_dirname if len(images_dirname) == 0: - images_dirname = IMAGENET_IMAGE_DIRNAME + images_dirname = IMAGENET_IMAGE_DIRNAME if len(annotations_dirname) == 0: annotations_dirname = PART_IMAGENET_ANNOTATION_ROOT json_file = os.path.join(annotations_dirname, split + ".json") DatasetCatalog.register(name, lambda: load_json_with_label_limit(json_file, images_dirname, name, label_percentage)) - # TODO: add part_classes field in metadata below. 
- MetadataCatalog.get(name).set(json_file=json_file, + + # class id is defined based on imagenet-1k idexing. + fname_to_classname = torch.load('datasets/metadata/imagenet_1k_fname_classname_dict.pkl') + class_code_to_class_id = {k: i for i, k in enumerate(list(fname_to_classname.keys()))} + MetadataCatalog.get(name).set(json_file=json_file, image_root=images_dirname, - # part_classes=None, + imagenet_1k_class_code_to_class_id=class_code_to_class_id ) + + diff --git a/part_distillation/data/datasets/register_pascal_parts.py b/part_distillation/data/datasets/register_pascal_parts.py index 3a0fae3..ac6aa69 100644 --- a/part_distillation/data/datasets/register_pascal_parts.py +++ b/part_distillation/data/datasets/register_pascal_parts.py @@ -3,16 +3,17 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import torch import logging import os -import copy +import copy import numpy as np import pycocotools.mask as mask_util import scipy.io from typing import Any, Dict, List, Tuple, Union from detectron2.data import DatasetCatalog, MetadataCatalog -from detectron2.data.datasets.pascal_voc import CLASS_NAMES, load_voc_instances +from .pascal_voc import CLASS_NAMES, load_voc_instances from detectron2.structures import BoxMode from detectron2.utils.file_io import PathManager from .pascal_info import get_orig_part, categories @@ -35,7 +36,7 @@ def mask_to_bbox(mask: np.ndarray) -> Tuple[float, float, float, float]: return x1, y1, x2, y2 -def get_part_annotation_dict(part_instance: Any, subset_class_names: Union[List[str], Any], +def get_part_annotation_dict(part_instance: Any, subset_class_names: Union[List[str], Any], encode=True, subset_part_name_to_ids={}) -> Tuple[Dict, List]: class_name = part_instance[0][0] if class_name == "table": @@ -57,7 +58,7 @@ def get_part_annotation_dict(part_instance: Any, subset_class_names: Union[List[ { "part_category": p[0][0], "orig_part_category": orig_part_name, 
- "orig_part_category_id": subset_part_name_to_ids[orig_part_name], + "orig_part_category_id": subset_part_name_to_ids[orig_part_name], "bbox": mask_to_bbox(p[1]), "bbox_mode": BoxMode.XYXY_ABS, "segmentation": mask_util.encode(p[1]) if encode else p[1], @@ -100,19 +101,20 @@ def load_pascal_parts_instances( dict["annotations"] = [] for inst in instances: if inst[0][0] in subset_class_names: - object_annotation, part_annotations = get_part_annotation_dict(inst, - subset_class_names=subset_class_names, + object_annotation, part_annotations = get_part_annotation_dict(inst, + subset_class_names=subset_class_names, subset_part_name_to_ids=subset_part_name_to_ids) - - # for segmentation, each instance is saved in a separate dict. + + # for segmentation, each instance is saved in a separate dict. if for_segmentation: new_dict = copy.deepcopy(dict) - - # some object has no parts. + + # some object has no parts. if len(part_annotations) > 0: new_dict["annotations"].append(object_annotation) new_dict["part_annotations"].append(part_annotations) final_dicts.append(new_dict) + num_found += 1 else: if len(part_annotations) > 0: dict["annotations"].append(object_annotation) @@ -124,7 +126,7 @@ def load_pascal_parts_instances( num_found += 1 if label_percentage < 100: - # shuffle and pick first n. + # shuffle and pick first n. np.random.seed(1234) np.random.shuffle(final_dicts) @@ -144,41 +146,41 @@ def register_pascal_parts( subset_class_names=None, label_percentage: int=100, for_segmentation: bool=False, - debug=False, -): + debug=False, +): """ subset_class_names: Subset of PascalParts classes to use, - label_percentage: Percentage of labels to register. Used for few-shot learning. - for_segmentation: For segmentation evaluation, each image has one object instance. - Dataset will then have duplicate images. - debug: For quick dubugging, only register a small portion. + label_percentage: Percentage of labels to register. Used for few-shot learning. 
+ for_segmentation: For segmentation evaluation, each image has one object instance. + Dataset will then have duplicate images. + debug: For quick dubugging, only register a small portion. """ if len(images_dirname) == 0: - images_dirname = PASCALPARTS_DATASET_PATH + images_dirname = PASCALPARTS_DATASET_PATH if len(annotations_dirname) == 0: - annotations_dirname = PASCALPARTS_ANNOTATION_PATH + annotations_dirname = PASCALPARTS_ANNOTATION_PATH if subset_class_names is not None and len(subset_class_names) > 0: subset_class_names = sorted(subset_class_names) else: - subset_class_names = CLASS_NAMES + subset_class_names = CLASS_NAMES pid = 0 subset_part_name_to_ids = {} for class_name in subset_class_names: if class_name == "table": class_name = "diningtable" - - # part IDs are re-defined for subset classes. + + # part IDs are re-defined for subset classes. for part in OBJ_NAMES_TO_PART_NAMES_DICT[class_name]: pname = part.orig_name if pname not in subset_part_name_to_ids: - subset_part_name_to_ids[pname] = pid - pid += 1 + subset_part_name_to_ids[pname] = pid + pid += 1 DatasetCatalog.register( name, lambda: load_pascal_parts_instances( - images_dirname, annotations_dirname, split, + images_dirname, annotations_dirname, split, subset_class_names=subset_class_names, subset_part_name_to_ids=subset_part_name_to_ids, label_percentage=label_percentage, @@ -187,11 +189,13 @@ def register_pascal_parts( ), ) MetadataCatalog.get(name).set( - thing_classes=list(subset_class_names), + thing_classes=list(subset_class_names), part_classes=list(subset_part_name_to_ids.keys()), - classes=list(subset_part_name_to_ids.keys()), + classes=list(subset_part_name_to_ids.keys()), dirname=images_dirname, annotations_dirname=annotations_dirname, year=year, split=split, ) + + diff --git a/part_distillation/demo/part_distillation_predictor.py b/part_distillation/demo/part_distillation_predictor.py new file mode 100644 index 0000000..3d2ffb1 --- /dev/null +++ 
b/part_distillation/demo/part_distillation_predictor.py @@ -0,0 +1,241 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# From https://github.com/facebookresearch/Detic/blob/main/detic/predictor.py. +# Modified by Jang Hyun Cho. + +import atexit +import bisect +import multiprocessing as mp +from collections import deque +import cv2 +import torch + +from detectron2.data import MetadataCatalog +from detectron2.engine.defaults import DefaultPredictor +from detectron2.utils.video_visualizer import VideoVisualizer +from detectron2.utils.visualizer import ColorMode, Visualizer +from detectron2.structures import BitMasks, Instances +from Detic.detic.modeling.utils import reset_cls_test +from continuously_postprocess_dcrf import dense_crf + +class CustomPredictor(DefaultPredictor): + """ + D2's DefaultPredictor but takes arbitrary input as argument and add to input to model. + """ + def reshape_image(self, image): + if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + image = image[:, :, ::-1] + image = self.aug.get_transform(image).apply_image(image) + image = image[:, :, ::-1] + return image + + def __call__(self, original_image, additional_input={}): + with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 + # Apply pre-processing to image. 
+ if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + original_image = original_image[:, :, ::-1] + + image = self.aug.get_transform(original_image).apply_image(original_image) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + height, width = image.shape[1:] + + inputs = {"image": image, "height": height, "width": width} + inputs.update(additional_input) + predictions = self.model([inputs])[0] + return predictions + + + +def get_clip_embeddings(vocabulary, prompt='a '): + from Detic.detic.modeling.text.text_encoder import build_text_encoder + text_encoder = build_text_encoder(pretrain=True) + text_encoder.eval() + texts = [prompt + x for x in vocabulary] + emb = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() + return emb + +BUILDIN_CLASSIFIER = { + 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', + 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', + 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', + 'coco': 'datasets/metadata/coco_clip_a+cname.npy', +} + +BUILDIN_METADATA_PATH = { + 'lvis': 'lvis_v1_val', + 'objects365': 'objects365_v2_val', + 'openimages': 'oid_val_expanded', + 'coco': 'coco_2017_val', +} + +class PartVisualizationDemo(object): + def __init__(self, object_cfg, part_cfg, args, + instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + object_cfg (CfgNode): + part_cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. 
+ """ + if args.vocabulary == 'custom': + self.metadata = MetadataCatalog.get("__unused") + self.metadata.thing_classes = args.custom_vocabulary.split(',') + classifier = get_clip_embeddings(self.metadata.thing_classes) + else: + self.metadata = MetadataCatalog.get( + BUILDIN_METADATA_PATH[args.vocabulary]) + classifier = BUILDIN_CLASSIFIER[args.vocabulary] + + num_classes = len(self.metadata.thing_classes) + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + self.dcrf = args.dcrf + self.parallel = parallel + if parallel: + num_gpu = torch.cuda.device_count() + self.object_predictor = AsyncPredictor(object_cfg, num_gpus=num_gpu) + self.part_predictor = AsyncPredictor(part_cfg, num_gpus=num_gpu) + else: + self.object_predictor = CustomPredictor(object_cfg) + self.part_predictor = CustomPredictor(part_cfg) + reset_cls_test(self.object_predictor.model, classifier, num_classes) + + def run_on_image(self, image): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. 
+ """ + vis_output = None + object_prediction = self.object_predictor(image) + + masks = object_prediction["instances"].pred_masks + scores = object_prediction["instances"].scores + topk_idxs = scores.topk(1)[1].flatten() + masks_selected = masks[topk_idxs] + part_instance = Instances(object_prediction["instances"].image_size) + part_instance.gt_masks = BitMasks(masks_selected) + part_instance.gt_classes = torch.zeros(1) + object_instance = Instances(object_prediction["instances"].image_size) + object_instance.gt_masks = BitMasks(masks_selected) + object_instance.gt_classes = torch.zeros(1) + object_input = {"instances": object_instance, "part_instances": part_instance} + # print(masks_selected.shape, object_prediction["instances"].image_size, image.shape, flush=True) + predictions = self.part_predictor(image, object_input) + + # Convert image from OpenCV BGR format to Matplotlib RGB format. + image = self.part_predictor.reshape_image(image) + image = image[:, :, ::-1] + visualizer = Visualizer(image, None, instance_mode=self.instance_mode) + instances = predictions["proposals"].to(self.cpu_device) + if self.dcrf: + bmask = instances.pred_masks + num_c = bmask.shape[0] + cmask = (bmask * (torch.arange(num_c) + 1)[:, None, None]).sum(0) + cmask = torch.tensor(dense_crf(image, cmask, num_c + 1)) + o_cls = cmask.unique() + o_cls = o_cls[o_cls != 0] + bmask = torch.zeros(len(o_cls), *cmask.shape).bool() + for i, c in enumerate(o_cls): + bmask[i] = cmask == c + instances.pred_masks = bmask + vis_output = visualizer.draw_instance_predictions(predictions=instances) + + return predictions, vis_output + +class AsyncPredictor: + """ + A predictor that runs the model asynchronously, possibly on >1 GPUs. + Because rendering the visualization takes considerably amount of time, + this helps improve throughput a little bit when rendering videos. 
+ """ + + class _StopToken: + pass + + class _PredictWorker(mp.Process): + def __init__(self, cfg, task_queue, result_queue): + self.cfg = cfg + self.task_queue = task_queue + self.result_queue = result_queue + super().__init__() + + def run(self): + predictor = CustomPredictor(self.cfg) + + while True: + task = self.task_queue.get() + if isinstance(task, AsyncPredictor._StopToken): + break + idx, data = task + result = predictor(data) + self.result_queue.put((idx, result)) + + def __init__(self, cfg, num_gpus: int = 1): + """ + Args: + cfg (CfgNode): + num_gpus (int): if 0, will run on CPU + """ + num_workers = max(num_gpus, 1) + self.task_queue = mp.Queue(maxsize=num_workers * 3) + self.result_queue = mp.Queue(maxsize=num_workers * 3) + self.procs = [] + for gpuid in range(max(num_gpus, 1)): + cfg = cfg.clone() + cfg.defrost() + cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) + + self.put_idx = 0 + self.get_idx = 0 + self.result_rank = [] + self.result_data = [] + + for p in self.procs: + p.start() + atexit.register(self.shutdown) + + def put(self, image): + self.put_idx += 1 + self.task_queue.put((self.put_idx, image)) + + def get(self): + self.get_idx += 1 # the index needed for this request + if len(self.result_rank) and self.result_rank[0] == self.get_idx: + res = self.result_data[0] + del self.result_data[0], self.result_rank[0] + return res + + while True: + # make sure the results are returned in the correct order + idx, res = self.result_queue.get() + if idx == self.get_idx: + return res + insert = bisect.bisect(self.result_rank, idx) + self.result_rank.insert(insert, idx) + self.result_data.insert(insert, res) + + def __len__(self): + return self.put_idx - self.get_idx + + def __call__(self, image): + self.put(image) + return self.get() + + def shutdown(self): + for _ in self.procs: + self.task_queue.put(AsyncPredictor._StopToken()) + + 
@property + def default_buffer_size(self): + return len(self.procs) * 5 \ No newline at end of file diff --git a/part_distillation/demo/part_segment_predictor.py b/part_distillation/demo/part_segment_predictor.py new file mode 100644 index 0000000..a077338 --- /dev/null +++ b/part_distillation/demo/part_segment_predictor.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# From https://github.com/facebookresearch/Detic/blob/main/detic/predictor.py. +# Modified by Jang Hyun Cho. + +import atexit +import bisect +import multiprocessing as mp +from collections import deque +import cv2 +import torch + +from detectron2.data import MetadataCatalog +from detectron2.engine.defaults import DefaultPredictor +from detectron2.utils.video_visualizer import VideoVisualizer +from detectron2.utils.visualizer import ColorMode, Visualizer +from detectron2.structures import BitMasks, Instances +from Detic.detic.modeling.utils import reset_cls_test +from continuously_postprocess_dcrf import dense_crf + +class CustomPredictor(DefaultPredictor): + """ + D2's DefaultPredictor but takes arbitrary input as argument and add to input to model. + """ + def reshape_image(self, image): + if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + image = image[:, :, ::-1] + image = self.aug.get_transform(image).apply_image(image) + image = image[:, :, ::-1] + return image + + def __call__(self, original_image, additional_input={}): + with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 + # Apply pre-processing to image. 
+ if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + original_image = original_image[:, :, ::-1] + + image = self.aug.get_transform(original_image).apply_image(original_image) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + height, width = image.shape[1:] + + inputs = {"image": image, "height": height, "width": width} + inputs.update(additional_input) + predictions = self.model([inputs])[0] + return predictions + + + +def get_clip_embeddings(vocabulary, prompt='a '): + from Detic.detic.modeling.text.text_encoder import build_text_encoder + text_encoder = build_text_encoder(pretrain=True) + text_encoder.eval() + texts = [prompt + x for x in vocabulary] + emb = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() + return emb + +BUILDIN_CLASSIFIER = { + 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', + 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', + 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', + 'coco': 'datasets/metadata/coco_clip_a+cname.npy', +} + +BUILDIN_METADATA_PATH = { + 'lvis': 'lvis_v1_val', + 'objects365': 'objects365_v2_val', + 'openimages': 'oid_val_expanded', + 'coco': 'coco_2017_val', +} + +class PartVisualizationDemo(object): + def __init__(self, object_cfg, part_cfg, args, + instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + object_cfg (CfgNode): + part_cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. 
+ """ + if args.vocabulary == 'custom': + self.metadata = MetadataCatalog.get("__unused") + self.metadata.thing_classes = args.custom_vocabulary.split(',') + classifier = get_clip_embeddings(self.metadata.thing_classes) + else: + self.metadata = MetadataCatalog.get( + BUILDIN_METADATA_PATH[args.vocabulary]) + classifier = BUILDIN_CLASSIFIER[args.vocabulary] + + num_classes = len(self.metadata.thing_classes) + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + self.dcrf = args.dcrf + self.parallel = parallel + if parallel: + num_gpu = torch.cuda.device_count() + self.object_predictor = AsyncPredictor(object_cfg, num_gpus=num_gpu) + self.part_predictor = AsyncPredictor(part_cfg, num_gpus=num_gpu) + else: + self.object_predictor = CustomPredictor(object_cfg) + self.part_predictor = CustomPredictor(part_cfg) + reset_cls_test(self.object_predictor.model, classifier, num_classes) + + def run_on_image(self, image): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. 
+ """ + vis_output = None + object_prediction = self.object_predictor(image) + + masks = object_prediction["instances"].pred_masks + scores = object_prediction["instances"].scores + topk_idxs = scores.topk(1)[1].flatten() + masks_selected = masks[topk_idxs] + part_instance = Instances(object_prediction["instances"].image_size) + part_instance.gt_masks = BitMasks(masks_selected) + part_instance.gt_classes = torch.zeros(1) + object_instance = Instances(object_prediction["instances"].image_size) + object_instance.gt_masks = BitMasks(masks_selected) + object_instance.gt_classes = torch.zeros(1) + object_input = {"instances": object_instance, "part_instances": part_instance} + # print(masks_selected.shape, object_prediction["instances"].image_size, image.shape, flush=True) + predictions = self.part_predictor(image, object_input) + + + # Convert image from OpenCV BGR format to Matplotlib RGB format. + image = self.part_predictor.reshape_image(image) + image = image[:, :, ::-1] + visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) + instances = predictions["proposals"].to(self.cpu_device) + if self.dcrf: + bmask = instances.pred_masks + num_c = bmask.shape[0] + cmask = (bmask * (torch.arange(num_c) + 1)[:, None, None]).sum(0) + cmask = torch.tensor(dense_crf(image, cmask, num_c + 1)) + o_cls = cmask.unique() + o_cls = o_cls[o_cls != 0] + bmask = torch.zeros(len(o_cls), *cmask.shape).bool() + for i, c in enumerate(o_cls): + bmask[i] = cmask == c + instances.pred_masks = bmask + vis_output = visualizer.draw_instance_predictions(predictions=instances) + + return predictions, vis_output + +class AsyncPredictor: + """ + A predictor that runs the model asynchronously, possibly on >1 GPUs. + Because rendering the visualization takes considerably amount of time, + this helps improve throughput a little bit when rendering videos. 
+ """ + + class _StopToken: + pass + + class _PredictWorker(mp.Process): + def __init__(self, cfg, task_queue, result_queue): + self.cfg = cfg + self.task_queue = task_queue + self.result_queue = result_queue + super().__init__() + + def run(self): + predictor = CustomPredictor(self.cfg) + + while True: + task = self.task_queue.get() + if isinstance(task, AsyncPredictor._StopToken): + break + idx, data = task + result = predictor(data) + self.result_queue.put((idx, result)) + + def __init__(self, cfg, num_gpus: int = 1): + """ + Args: + cfg (CfgNode): + num_gpus (int): if 0, will run on CPU + """ + num_workers = max(num_gpus, 1) + self.task_queue = mp.Queue(maxsize=num_workers * 3) + self.result_queue = mp.Queue(maxsize=num_workers * 3) + self.procs = [] + for gpuid in range(max(num_gpus, 1)): + cfg = cfg.clone() + cfg.defrost() + cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) + + self.put_idx = 0 + self.get_idx = 0 + self.result_rank = [] + self.result_data = [] + + for p in self.procs: + p.start() + atexit.register(self.shutdown) + + def put(self, image): + self.put_idx += 1 + self.task_queue.put((self.put_idx, image)) + + def get(self): + self.get_idx += 1 # the index needed for this request + if len(self.result_rank) and self.result_rank[0] == self.get_idx: + res = self.result_data[0] + del self.result_data[0], self.result_rank[0] + return res + + while True: + # make sure the results are returned in the correct order + idx, res = self.result_queue.get() + if idx == self.get_idx: + return res + insert = bisect.bisect(self.result_rank, idx) + self.result_rank.insert(insert, idx) + self.result_data.insert(insert, res) + + def __len__(self): + return self.put_idx - self.get_idx + + def __call__(self, image): + self.put(image) + return self.get() + + def shutdown(self): + for _ in self.procs: + self.task_queue.put(AsyncPredictor._StopToken()) + + 
@property + def default_buffer_size(self): + return len(self.procs) * 5 \ No newline at end of file diff --git a/part_distillation/evaluation/__init__.py b/part_distillation/evaluation/__init__.py index 1090368..89423f2 100644 --- a/part_distillation/evaluation/__init__.py +++ b/part_distillation/evaluation/__init__.py @@ -3,9 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + from .proposal_evaluator import ProposalEvaluator -from .miou_matcher import mIOU_Matcher -from .miou_evaluator import mIOU_Evaluator -from .clustering_module import ClusteringModule +from .miou_matcher import mIOU_Matcher +from .miou_evaluator import mIOU_Evaluator +from .clustering_module import ClusteringModule from .null_evaluator import NullEvaluator -from .supervised_miou_evaluator import Supervised_mIOU_Evaluator +from .supervised_miou_evaluator import Supervised_mIOU_Evaluator \ No newline at end of file diff --git a/part_distillation/evaluation/clustering_module.py b/part_distillation/evaluation/clustering_module.py index 7f5f54e..39c5768 100644 --- a/part_distillation/evaluation/clustering_module.py +++ b/part_distillation/evaluation/clustering_module.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import copy import itertools import logging @@ -12,13 +13,13 @@ from detectron2.utils.comm import is_main_process, synchronize, all_gather from detectron2.evaluation import DatasetEvaluator -from sklearn.cluster import KMeans +from sklearn.cluster import KMeans class ClusteringModule(DatasetEvaluator): - def __init__(self, - distributed=True, - num_clusters=8, + def __init__(self, + distributed=True, + num_clusters=8, ): self._logger = logging.getLogger("part_distillation") self._distributed = distributed @@ -48,12 +49,12 @@ def evaluate(self): gt_labels = all_gather(self._class_labels_list) gt_labels = list(itertools.chain(*gt_labels)) - + proposal_features = torch.cat(proposal_features, dim=0) gt_labels = torch.cat(gt_labels, dim=0) gt_unique = gt_labels.unique().long().numpy() - # only run the main process since clustering is on cpu. + # only run the main process since clustering is on cpu. cluster_centroids_dict = {} if comm.is_main_process(): for cid in gt_unique: @@ -62,12 +63,12 @@ def evaluate(self): cluster_centroids_dict[cid] = self._get_cluster_centroids(proposal_features_i, cid) else: cluster_centroids_dict[cid] = torch.randn(self.num_clusters, proposal_features_i.shape[1]) - + synchronize() cluster_centroids_dict = all_gather(cluster_centroids_dict) cluster_centroids_dict = cluster_centroids_dict[0] # 0 is the main process. 
- - return copy.deepcopy(cluster_centroids_dict) + + return copy.deepcopy(cluster_centroids_dict) def _get_cluster_centroids(self, proposal_features, cid): @@ -77,5 +78,6 @@ def _get_cluster_centroids(self, proposal_features, cid): centroids = kmeans.cluster_centers_ centroids = torch.tensor(centroids).float() - + return centroids + diff --git a/part_distillation/evaluation/miou_evaluator.py b/part_distillation/evaluation/miou_evaluator.py index aefa8a0..5370c1f 100644 --- a/part_distillation/evaluation/miou_evaluator.py +++ b/part_distillation/evaluation/miou_evaluator.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import logging import os import numpy as np @@ -43,10 +44,10 @@ def process(self, inputs: List[Dict], outputs: Dict): pred_instances = output_per_image["predictions"].to(self._cpu_device) gt_instances = output_per_image["gt_instances"].to(self._cpu_device) - pred_masks = pred_instances.pred_masks - pred_classes = pred_instances.pred_classes - gt_masks = gt_instances.gt_masks - gt_classes = gt_instances.gt_classes + pred_masks = pred_instances.pred_masks + pred_classes = pred_instances.pred_classes + gt_masks = gt_instances.gt_masks + gt_classes = gt_instances.gt_classes gt_object_class = output_per_image["gt_object_label"].item() assert pred_masks.shape[1:] == gt_masks.shape[1:] @@ -66,8 +67,8 @@ def process(self, inputs: List[Dict], outputs: Dict): def _binary_mask_to_semseg(self, masks, classes): semseg = torch.full(masks.shape[1:], fill_value=self.gt_num_classes) for i, c in enumerate(classes): - semseg[torch.where(masks[i]==True)] = c - return semseg + semseg[torch.where(masks[i]==True)] = c + return semseg def evaluate(self): self._logger.info("Start evaluating ...") @@ -80,9 +81,9 @@ def evaluate(self): classes_used_total = all_gather(self._classes_used) for cset in classes_used_total: _classes_used = _classes_used.union(cset) - self._classes_used = 
_classes_used + self._classes_used = _classes_used - synchronize() + synchronize() for k in self._classes_used: if k not in self._conf_matrix: self._conf_matrix[k] = np.zeros((self.gt_num_classes + 1, self.gt_num_classes + 1), \ @@ -103,7 +104,7 @@ def evaluate(self): seg_results_all["A-mACC"].extend([v for k, v in seg_results.items() if "ACC-" in k and not np.isnan(v)]) seg_results_all["C-mIoPred"].append(seg_results["mIoPred"]) seg_results_all["A-mIoPred"].extend([v for k, v in seg_results.items() if "IoPred-" in k and not np.isnan(v)]) - + seg_results_all["C-mIoU"] = np.mean(seg_results_all["C-mIoU"]) seg_results_all["A-mIoU"] = np.mean(seg_results_all["A-mIoU"]) seg_results_all["C-mIoPred"] = np.mean(seg_results_all["C-mIoPred"]) @@ -123,7 +124,7 @@ def measure_mIOU(self, conf_matrix): bg considered separately class_names: List with names of forground classes """ - num_classes = self.gt_num_classes + num_classes = self.gt_num_classes class_names = self._class_names acc = np.full(num_classes, np.nan, dtype=float) diff --git a/part_distillation/evaluation/miou_matcher.py b/part_distillation/evaluation/miou_matcher.py index 53cf21e..d35ef21 100644 --- a/part_distillation/evaluation/miou_matcher.py +++ b/part_distillation/evaluation/miou_matcher.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import logging import os import torch @@ -28,10 +29,10 @@ def __init__( self._cpu_device = torch.device("cpu") metadata = MetadataCatalog.get(dataset_name) - self.pred_num_classes = num_classes + self.pred_num_classes = num_classes self.gt_num_classes = len(metadata.part_classes) if hasattr(metadata, "part_classes") \ else len(metadata.thing_classes) - self._class_names = metadata.thing_classes + self._class_names = metadata.thing_classes self.n = max(self.gt_num_classes, self.pred_num_classes) self._logger.info("mIOU-matcher initialized (n:{}/gt:{}/pd:{})."\ .format(self.n, self.gt_num_classes, self.pred_num_classes)) @@ -45,11 +46,11 @@ def process(self, inputs: List[Dict], outputs: Dict): pred_instances = output_per_image["predictions"].to(self._cpu_device) gt_instances = output_per_image["gt_instances"].to(self._cpu_device) - pred_masks = pred_instances.pred_masks - pred_classes = pred_instances.pred_classes - gt_masks = gt_instances.gt_masks - gt_classes = gt_instances.gt_classes - gt_object_class = output_per_image["gt_object_label"].item() + pred_masks = pred_instances.pred_masks + pred_classes = pred_instances.pred_classes + gt_masks = gt_instances.gt_masks + gt_classes = gt_instances.gt_classes + gt_object_class = output_per_image["gt_object_label"].item() assert pred_masks.shape[1:] == gt_masks.shape[1:], '{} != {}'.format(pred_masks.shape, gt_masks.shape) if gt_object_class not in self._conf_matrix: @@ -60,7 +61,7 @@ def process(self, inputs: List[Dict], outputs: Dict): conf_matrix_i = np.bincount( (self.n + 1) * pd.reshape(-1) + gt.reshape(-1), - minlength=self._conf_matrix[gt_object_class].size, + minlength=self._conf_matrix[gt_object_class].size, ).reshape(self._conf_matrix[gt_object_class].shape) self._conf_matrix[gt_object_class] += conf_matrix_i @@ -70,8 +71,8 @@ def process(self, inputs: List[Dict], outputs: Dict): def _binary_mask_to_semseg(self, masks, classes): semseg = torch.full(masks.shape[1:], fill_value=self.n) for i, c in 
enumerate(classes): - semseg[torch.where(masks[i]==True)] = c - return semseg + semseg[torch.where(masks[i]==True)] = c + return semseg def evaluate(self): @@ -83,9 +84,9 @@ def evaluate(self): classes_used_total = all_gather(self._classes_used) for cset in classes_used_total: _classes_used = _classes_used.union(cset) - self._classes_used = _classes_used + self._classes_used = _classes_used - synchronize() + synchronize() for k in self._classes_used: if k not in self._conf_matrix: self._conf_matrix[k] = np.zeros((self.n + 1, self.n + 1), \ @@ -98,7 +99,7 @@ def evaluate(self): _conf_matrix = np.zeros_like(self._conf_matrix[k]) for conf_matrix in conf_matrix_list: _conf_matrix += conf_matrix - + matching_mapper_dict[k] = self.majority_voting(_conf_matrix) return matching_mapper_dict diff --git a/part_distillation/evaluation/null_evaluator.py b/part_distillation/evaluation/null_evaluator.py index 4a52249..e835796 100644 --- a/part_distillation/evaluation/null_evaluator.py +++ b/part_distillation/evaluation/null_evaluator.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + from typing import Dict, List from detectron2.utils.comm import synchronize from detectron2.evaluation.evaluator import DatasetEvaluator @@ -10,11 +11,11 @@ class NullEvaluator(DatasetEvaluator): def reset(self): - return + return def process(self, inputs: List[Dict], outputs: Dict): - return - + return + def evaluate(self): synchronize() - return + return \ No newline at end of file diff --git a/part_distillation/evaluation/proposal_evaluator.py b/part_distillation/evaluation/proposal_evaluator.py index f909fe4..13cc40c 100644 --- a/part_distillation/evaluation/proposal_evaluator.py +++ b/part_distillation/evaluation/proposal_evaluator.py @@ -3,14 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import os import itertools -import copy +import copy import json import logging import numpy as np import torch -import time +import time from collections import OrderedDict from typing import Dict, List @@ -54,30 +55,30 @@ def _evaluate_box_proposals(proposals_list, gt_masks_list, area="all", limit=Non assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] - gt_overlaps = [] - num_pos = 0 + gt_overlaps = [] + num_pos = 0 time_for_iou = [] - for (proposals_mask, proposals_score), gt_masks in zip(proposals_list, gt_masks_list): + for (proposals_mask, proposals_score), gt_masks in zip(proposals_list, gt_masks_list): inds = proposals_score.sort(descending=True)[1] proposals = [proposals_mask[i] for i in inds] if len(proposals) == 0 or len(gt_masks) == 0: - continue + continue gt_areas = gt_masks.float().flatten(1).sum(-1) - valid_gt_inds = (gt_areas > area_range[0]) & (gt_areas <= area_range[1]) + valid_gt_inds = (gt_areas > area_range[0]) & (gt_areas <= area_range[1]) gt_masks = gt_masks[valid_gt_inds] num_pos += len(gt_masks) if len(gt_masks) == 0: - continue - + continue + if limit is not None and len(proposals) > limit: - proposals = proposals[:limit] - - t1 = time.time() + proposals = proposals[:limit] + + t1 = time.time() overlaps = pairwise_mask_iou_cocoapi(proposals, gt_masks) # 20x faster. time_for_iou.append(time.time()-t1) @@ -133,18 +134,18 @@ def __init__( limit: int=-1, ): """ - This evaluator evaluates baseline methods. + This evaluator evaluates baseline methods. - The evaluation is on AR metric. + The evaluation is on AR metric. - This evaluator will handle subset_class evaluation as well. + This evaluator will handle subset_class evaluation as well. 
""" self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir self._cpu_device = torch.device("cpu") - self.areas = areas + self.areas = areas self.limit = limit def reset(self): @@ -181,15 +182,15 @@ def evaluate(self): gts = list(itertools.chain(*gts)) if not is_main_process(): - return {} + return {} else: - predictions = self._predictions - gts = self._gts + predictions = self._predictions + gts = self._gts if len(predictions) == 0: self._logger.warning("[ProposalEvaluator] Did not receive valid predictions.") return {} - + self._results = OrderedDict() self._eval_proposals(predictions, gts) diff --git a/part_distillation/evaluation/supervised_miou_evaluator.py b/part_distillation/evaluation/supervised_miou_evaluator.py index 997f554..4cf7c2f 100644 --- a/part_distillation/evaluation/supervised_miou_evaluator.py +++ b/part_distillation/evaluation/supervised_miou_evaluator.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import logging import os import numpy as np @@ -29,7 +30,7 @@ def __init__( self._output_dir = output_dir self._cpu_device = torch.device("cpu") self._num_classes = num_classes - self._class_names = MetadataCatalog.get(dataset_name).thing_classes + self._class_names = MetadataCatalog.get(dataset_name).thing_classes self._logger.info("Supervised mIOU-evaluator initialized (gt:{}).".format(self._num_classes)) def reset(self): @@ -40,9 +41,9 @@ def process(self, inputs: List[Dict], outputs: Dict): pred_instances = output_per_image["predictions"].to(self._cpu_device) gt_instances = output_per_image["gt_instances"].to(self._cpu_device) - pred_masks = pred_instances.pred_masks - pred_classes = pred_instances.pred_classes - gt_masks = gt_instances.gt_masks + pred_masks = pred_instances.pred_masks + pred_classes = pred_instances.pred_classes + gt_masks = gt_instances.gt_masks gt_classes = gt_instances.gt_classes assert pred_masks.shape[1:] == gt_masks.shape[1:] @@ -58,8 +59,8 @@ def process(self, inputs: List[Dict], outputs: Dict): def _binary_mask_to_semseg(self, masks, classes): semseg = torch.full(masks.shape[1:], fill_value=self._num_classes) for i, c in enumerate(classes): - semseg[torch.where(masks[i]==True)] = c - return semseg + semseg[torch.where(masks[i]==True)] = c + return semseg def evaluate(self): seg_results_all = {"mIoU": [], @@ -78,7 +79,7 @@ def evaluate(self): seg_results_all["mIoU"].extend([v for k, v in seg_results.items() if "IoU-" in k and not np.isnan(v)]) seg_results_all["mACC"].append(seg_results["mACC"]) seg_results_all["mIoPred"].append(seg_results["mIoPred"]) - + seg_results_all["mIoU"] = np.mean(seg_results_all["mIoU"]) seg_results_all["mIoPred"] = np.mean(seg_results_all["mIoPred"]) seg_results_all["mACC"] = np.mean(seg_results_all["mACC"]) @@ -94,7 +95,7 @@ def measure_mIOU(self, conf_matrix): bg considered separately class_names: List with names of forground classes """ - num_classes = self._num_classes + num_classes = self._num_classes 
class_names = self._class_names acc = np.full(num_classes, np.nan, dtype=float) diff --git a/part_distillation/labeling_detic.py b/part_distillation/labeling_detic.py index 96d1532..2a3f129 100644 --- a/part_distillation/labeling_detic.py +++ b/part_distillation/labeling_detic.py @@ -3,19 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import copy import logging import numpy as np import torch -import torch.nn.functional as F -import sys -import os +import torch.nn.functional as F +import sys +import os from typing import Dict, List, Optional, Tuple from detectron2.data import transforms as T from detectron2.structures import Boxes, Instances from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY -from Detic.detic.modeling.meta_arch.custom_rcnn import CustomRCNN +from Detic.detic.modeling.meta_arch.custom_rcnn import CustomRCNN from .utils.utils import proposals_to_coco_json from detectron2.data import detection_utils as utils @@ -27,7 +28,7 @@ def inference( self, batched_inputs: Tuple[Dict[str, torch.Tensor]], detected_instances: Optional[List[Instances]] = None, - do_postprocess: bool = True, + do_postprocess: bool = True, ): assert not self.training assert detected_instances is None @@ -35,42 +36,50 @@ def inference( images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) proposals, _ = self.proposal_generator(images, features, None) - results, _ = self.roi_heads(images, features, proposals) + + class_ids = None + if self.labeling_mode == 'max-gt-label': + class_ids = images.tensor.new_tensor([self.metadata.class_code_to_class_id[input_per_image["class_code"]] \ + for input_per_image in batched_inputs]).long() + results, _ = self.roi_heads(images, features, proposals, class_ids=class_ids) if do_postprocess: assert not torch.jit.is_scripting(), \ "Scripting is not supported for postprocess." 
results = CustomRCNN._postprocess( results, batched_inputs, images.image_sizes) - + output_list = self.save_detic_prediction(batched_inputs, results) return output_list - def register_metadata(self, metadata, debug): + def register_metadata(self, metadata, labeling_mode, score_thres, debug): self.metadata = metadata - self.root_save_path = metadata.save_path + self.root_save_path = metadata.save_path + self.labeling_mode = labeling_mode + self.score_thres = score_thres self.debug = debug def save_detic_prediction(self, batched_inputs, results): """ - results: list[Dict(Instances)]. + results: list[Dict(Instances)]. each result has: - result.pred_boxes - - result.scores + - result.scores - result.pred_classes - - result.pred_masks + - result.pred_masks """ output_list = [] for input_per_image, instance_dict in zip(batched_inputs, results): gt_class = self.metadata.class_code_to_class_id[input_per_image["class_code"]] pred_classes = instance_dict["instances"].pred_classes.cpu() + # print(pred_classes, gt_class, input_per_image["class_code"], flush=True) idxs = pred_classes == gt_class - if idxs.any(): + if idxs.any() and self.labeling_mode == 'max-gt-label': masks = instance_dict["instances"].pred_masks.cpu()[idxs] scores = instance_dict["instances"].scores.cpu()[idxs] boxes = instance_dict["instances"].pred_boxes.tensor.cpu()[idxs] @@ -81,8 +90,9 @@ def save_detic_prediction(self, batched_inputs, results): scores_selected = scores[topk_idxs] boxes_selected = boxes[topk_idxs] pred_classes = pred_classes[idxs][topk_idxs] - pred_names = [self.metadata.classes[i] for i in pred_classes] + pred_names = [self.metadata.class_names[i] for i in pred_classes] else: + # no output with gt-label or "max-score" mode masks = instance_dict["instances"].pred_masks.cpu() scores = instance_dict["instances"].scores.cpu() boxes = instance_dict["instances"].pred_boxes.tensor.cpu() @@ -93,27 +103,43 @@ def save_detic_prediction(self, batched_inputs, results): scores_selected = 
scores[topk_idxs] boxes_selected = boxes[topk_idxs] pred_classes = pred_classes[topk_idxs] - pred_names = [self.metadata.classes[i] for i in pred_classes] - - H, W = masks_selected.shape[-2:] - res = {"file_name": input_per_image["file_name"], - "file_path": input_per_image["file_path"], - "class_code": input_per_image["class_code"], - "class_name": input_per_image["class_name"], - "object_masks": proposals_to_coco_json(masks_selected) \ - if not self.debug else masks_selected, - "object_boxes": boxes_selected, - "object_scores": scores_selected, - "height": H, - "width": W, - "pred_names": pred_names, - } - - if not self.debug: - torch.save(res, os.path.join(self.root_save_path, - input_per_image["class_code"], + pred_names = [self.metadata.class_names[i] for i in pred_classes] + + if self.labeling_mode == 'human-only': + assert max(pred_classes) < 5, '[human-only] mode only considers ["person", "man", "woman", "toddler", "human"]' + + filtered_idxs = scores_selected > self.score_thres + + res = None + if filtered_idxs.any(): + masks_selected = masks_selected[filtered_idxs] + scores_selected = scores_selected[filtered_idxs] + boxes_selected = boxes_selected[filtered_idxs] + pred_classes = pred_classes[filtered_idxs] + pred_names = [pred_names[i] for i in filtered_idxs] # list of str + + H, W = masks_selected.shape[-2:] + res = {"file_name": input_per_image["file_name"], + "file_path": input_per_image["file_path"], + "class_code": input_per_image["class_code"], + "class_name": input_per_image["class_name"], + "object_masks": proposals_to_coco_json(masks_selected) \ + if not self.debug else masks_selected, + "object_boxes": boxes_selected, + "object_scores": scores_selected, + "height": H, + "width": W, + "pred_names": pred_names, + } + + if not self.debug and res is not None: + torch.save(res, os.path.join(self.root_save_path, + input_per_image["class_code"], input_per_image["file_name"])) - + output_list.append(res) + + return output_list + + - return output_list 
diff --git a/part_distillation/modeling/__init__.py b/part_distillation/modeling/__init__.py index c9665de..822760b 100644 --- a/part_distillation/modeling/__init__.py +++ b/part_distillation/modeling/__init__.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + from .backbone.swin import D2SwinTransformer from .pixel_decoder.fpn import BasePixelDecoder from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder diff --git a/part_distillation/modeling/backbone/swin.py b/part_distillation/modeling/backbone/swin.py index 7acfc66..3b099d8 100644 --- a/part_distillation/modeling/backbone/swin.py +++ b/part_distillation/modeling/backbone/swin.py @@ -5,12 +5,8 @@ # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# Modified from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation/blob/main/mmseg/models/backbones/swin_transformer.py import numpy as np import torch diff --git a/part_distillation/modeling/backbone/utils.py b/part_distillation/modeling/backbone/utils.py index 6230026..e9995d6 100644 --- a/part_distillation/modeling/backbone/utils.py +++ b/part_distillation/modeling/backbone/utils.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved import math import torch import torch.nn as nn @@ -187,4 +183,4 @@ def forward(self, x): x = self.proj(x) # B C H W -> B H W C x = x.permute(0, 2, 3, 1) - return x + return x \ No newline at end of file diff --git a/part_distillation/modeling/criterion.py b/part_distillation/modeling/criterion.py index 31fa210..77a26f4 100644 --- a/part_distillation/modeling/criterion.py +++ b/part_distillation/modeling/criterion.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py """ MaskFormer criterion. @@ -130,6 +126,8 @@ def loss_labels(self, outputs, targets, indices, num_masks): assert "pred_logits" in outputs src_logits = outputs["pred_logits"].float() + # print(src_logits.shape, flush=True) + idx = self._get_src_permutation_idx(indices) if idx is not None: target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) @@ -137,13 +135,12 @@ def loss_labels(self, outputs, targets, indices, num_masks): src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device ) target_classes[idx] = target_classes_o - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} else: losses = {"loss_ce": src_logits.sum()*0.0} return losses - + def loss_masks(self, outputs, targets, indices, num_masks): """Compute the losses related to the masks: the focal loss and the dice loss. 
targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] @@ -158,7 +155,7 @@ def loss_masks(self, outputs, targets, indices, num_masks): "loss_dice": outputs["pred_masks"].sum()*0.0, } - return losses + return losses src_masks = outputs["pred_masks"] src_masks = src_masks[src_idx] @@ -196,14 +193,12 @@ def loss_masks(self, outputs, targets, indices, num_masks): ).squeeze(1) losses = { - # "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), - # "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), - "loss_mask": sigmoid_ce_loss(point_logits, point_labels, num_masks), - "loss_dice": dice_loss(point_logits, point_labels, num_masks), + "loss_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks), + "loss_dice": dice_loss_jit(point_logits, point_labels, num_masks), + # "loss_mask": sigmoid_ce_loss(point_logits, point_labels, num_masks), + # "loss_dice": dice_loss(point_logits, point_labels, num_masks), } - del src_masks - del target_masks return losses def _get_src_permutation_idx(self, indices): diff --git a/part_distillation/modeling/matcher.py b/part_distillation/modeling/matcher.py index 2dae7f4..75c1f81 100644 --- a/part_distillation/modeling/matcher.py +++ b/part_distillation/modeling/matcher.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py """ Modules to compute the matching cost and solve the corresponding LSAP. 
@@ -95,7 +91,7 @@ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" - + @torch.no_grad() def memory_efficient_forward(self, outputs, targets): @@ -103,7 +99,7 @@ def memory_efficient_forward(self, outputs, targets): bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] - + # Iterate through batch size for b in range(bs): if outputs["pred_logits"][b].shape[-1] == 1: @@ -149,7 +145,7 @@ def memory_efficient_forward(self, outputs, targets): # Compute the dice loss betwen masks # cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) cost_dice = batch_dice_loss(out_mask, tgt_mask) - + # Final cost matrix C = ( self.cost_mask * cost_mask @@ -157,9 +153,9 @@ def memory_efficient_forward(self, outputs, targets): + self.cost_dice * cost_dice ) C = C.reshape(num_queries, -1).cpu() - - row, col = linear_sum_assignment(C) - sorted_idx = C[row, col].topk(len(row), largest=False)[1] + + row, col = linear_sum_assignment(C) + sorted_idx = C[row, col].topk(len(row), largest=False)[1] indices.append((row[sorted_idx], col[sorted_idx])) return [ diff --git a/part_distillation/modeling/meta_arch/mask_former_head.py b/part_distillation/modeling/meta_arch/mask_former_head.py index 96c843a..87444e2 100644 --- a/part_distillation/modeling/meta_arch/mask_former_head.py +++ b/part_distillation/modeling/meta_arch/mask_former_head.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. import logging from copy import deepcopy from typing import Callable, Dict, List, Optional, Tuple, Union @@ -48,7 +44,7 @@ def _load_from_state_dict( # "Please upgrade your models. Applying automatic conversion now ..." 
# ) for k in list(state_dict.keys()): - newk = k + newk = k if "sem_seg_head.pixel_decoder.pixel_decoder" in k: newk = k.replace("sem_seg_head.pixel_decoder.pixel_decoder", "sem_seg_head.pixel_decoder") state_dict[newk] = state_dict[k] diff --git a/part_distillation/modeling/pixel_decoder/fpn.py b/part_distillation/modeling/pixel_decoder/fpn.py index 41898bf..7df65a1 100644 --- a/part_distillation/modeling/pixel_decoder/fpn.py +++ b/part_distillation/modeling/pixel_decoder/fpn.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union diff --git a/part_distillation/modeling/pixel_decoder/msdeformattn.py b/part_distillation/modeling/pixel_decoder/msdeformattn.py index e961c3f..0ff1a81 100644 --- a/part_distillation/modeling/pixel_decoder/msdeformattn.py +++ b/part_distillation/modeling/pixel_decoder/msdeformattn.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. 
import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union @@ -205,7 +201,7 @@ def __init__( self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" self.feature_strides = [v.stride for k, v in input_shape] self.feature_channels = [v.channels for k, v in input_shape] - + # this is the input shape of transformer encoder (could use less features than pixel decoder transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" @@ -254,7 +250,7 @@ def __init__( padding=0, ) weight_init.c2_xavier_fill(self.mask_features) - + self.maskformer_num_feature_levels = 3 # always use 3 scales self.common_stride = common_stride diff --git a/part_distillation/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt b/part_distillation/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt index 78a56f7..078ac85 100644 --- a/part_distillation/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt +++ b/part_distillation/modeling/pixel_decoder/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt @@ -1,7 +1,7 @@ setup.py -/private/home/janghyuncho7/PartDistllation/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp -/private/home/janghyuncho7/PartDistllation/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp -/private/home/janghyuncho7/PartDistllation/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu +/u/vcho/PartDistillation_new/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp +/u/vcho/PartDistillation_new/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp +/u/vcho/PartDistillation_new/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu MultiScaleDeformableAttention.egg-info/PKG-INFO 
MultiScaleDeformableAttention.egg-info/SOURCES.txt MultiScaleDeformableAttention.egg-info/dependency_links.txt diff --git a/part_distillation/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg b/part_distillation/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg index 280dd40..c129215 100644 Binary files a/part_distillation/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg and b/part_distillation/modeling/pixel_decoder/ops/dist/MultiScaleDeformableAttention-1.0-py3.9-linux-x86_64.egg differ diff --git a/part_distillation/modeling/pixel_decoder/ops/functions/__init__.py b/part_distillation/modeling/pixel_decoder/ops/functions/__init__.py index 7caea42..2b06b5a 100644 --- a/part_distillation/modeling/pixel_decoder/ops/functions/__init__.py +++ b/part_distillation/modeling/pixel_decoder/ops/functions/__init__.py @@ -6,10 +6,8 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/part_distillation/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py b/part_distillation/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py index 3c5f2fb..b18408f 100644 --- a/part_distillation/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py +++ b/part_distillation/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py @@ -6,11 +6,8 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function diff --git a/part_distillation/modeling/pixel_decoder/ops/make.sh b/part_distillation/modeling/pixel_decoder/ops/make.sh index 5d25cba..7b38cdb 100755 --- a/part_distillation/modeling/pixel_decoder/ops/make.sh +++ b/part_distillation/modeling/pixel_decoder/ops/make.sh @@ -7,10 +7,7 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR python setup.py build install diff --git a/part_distillation/modeling/pixel_decoder/ops/modules/__init__.py b/part_distillation/modeling/pixel_decoder/ops/modules/__init__.py index 4981570..6fdbf03 100644 --- a/part_distillation/modeling/pixel_decoder/ops/modules/__init__.py +++ b/part_distillation/modeling/pixel_decoder/ops/modules/__init__.py @@ -6,10 +6,7 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn import MSDeformAttn diff --git a/part_distillation/modeling/pixel_decoder/ops/modules/ms_deform_attn.py b/part_distillation/modeling/pixel_decoder/ops/modules/ms_deform_attn.py index c4e2309..76e04e7 100644 --- a/part_distillation/modeling/pixel_decoder/ops/modules/ms_deform_attn.py +++ b/part_distillation/modeling/pixel_decoder/ops/modules/ms_deform_attn.py @@ -6,12 +6,8 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR - +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function @@ -117,13 +113,13 @@ def forward(self, query, reference_points, input_flatten, input_spatial_shapes, else: raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) - + try: output = MSDeformAttnFunction.apply( value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) except: # CPU - # NOTE (Vincent): Work-around until we figure out the cuda path issue. + # NOTE (Vincent): Work-around until we figure out the cuda path issue. output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) # # For FLOPs calculation only # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) diff --git a/part_distillation/modeling/pixel_decoder/ops/setup.py b/part_distillation/modeling/pixel_decoder/ops/setup.py index e0b9c5a..3b57ad3 100644 --- a/part_distillation/modeling/pixel_decoder/ops/setup.py +++ b/part_distillation/modeling/pixel_decoder/ops/setup.py @@ -6,11 +6,8 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR import os import glob diff --git a/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp b/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp index b992200..48757e2 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp +++ b/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp @@ -1,17 +1,16 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include @@ -22,7 +21,7 @@ Modified from https://github.com/fundamentalvision/Deformable-DETR at::Tensor ms_deform_attn_cpu_forward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -34,7 +33,7 @@ ms_deform_attn_cpu_forward( std::vector ms_deform_attn_cpu_backward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -44,3 +43,4 @@ ms_deform_attn_cpu_backward( { AT_ERROR("Not implement on cpu"); } + diff --git a/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h b/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h index 940b3d8..51bb27e 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h +++ b/part_distillation/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h @@ -1,17 +1,16 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once @@ -19,7 +18,7 @@ Modified from https://github.com/fundamentalvision/Deformable-DETR at::Tensor ms_deform_attn_cpu_forward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -28,10 +27,12 @@ ms_deform_attn_cpu_forward( std::vector ms_deform_attn_cpu_backward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); + + diff --git a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu index 8903952..0c465da 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu +++ b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu @@ -1,17 +1,16 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. - -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include @@ -24,7 +23,7 @@ Modified from https://github.com/fundamentalvision/Deformable-DETR at::Tensor ms_deform_attn_cuda_forward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -56,7 +55,7 @@ at::Tensor ms_deform_attn_cuda_forward( const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); - + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); const int batch_n = im2col_step_; @@ -87,7 +86,7 @@ at::Tensor ms_deform_attn_cuda_forward( std::vector ms_deform_attn_cuda_backward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -133,7 +132,7 @@ std::vector ms_deform_attn_cuda_backward( auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); - + for (int n = 0; n < batch/im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); @@ -156,4 +155,4 @@ std::vector ms_deform_attn_cuda_backward( return { grad_value, grad_sampling_loc, grad_attn_weight }; -} +} \ No newline at end of file diff --git a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h index 9ef49e6..4f0658e 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h +++ b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h @@ -1,24 +1,23 @@ /*! 
------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -26,10 +25,11 @@ at::Tensor ms_deform_attn_cuda_forward( const int im2col_step); std::vector ms_deform_attn_cuda_backward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); + diff --git a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh index e4f21e8..c04e0d4 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh +++ b/part_distillation/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh @@ -1,17 +1,17 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. - -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. 
All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************** +* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) +* Copyright (c) 2018 Microsoft +************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include @@ -36,7 +36,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) template -__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { @@ -90,12 +90,12 @@ __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, template -__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, - scalar_t* &grad_value, + scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { @@ -145,7 +145,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); + atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) @@ -165,12 +165,12 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, template -__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* 
&bottom_data, +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, - scalar_t* &grad_value, + scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { @@ -220,7 +220,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); + atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) @@ -233,7 +233,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } @@ -241,15 +241,15 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, template __global__ void ms_deformable_im2col_gpu_kernel(const int n, - const scalar_t *data_value, + const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -260,7 +260,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n, int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int 
sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -273,7 +273,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n, const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; - + for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; @@ -308,13 +308,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -330,7 +330,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -374,10 +374,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } - + __syncthreads(); if (tid == 0) { @@ -390,8 +390,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co _grad_a += cache_grad_attn_weight[tid]; sid += 2; } - - + + *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = 
_grad_h; *grad_attn_weight = _grad_a; @@ -413,13 +413,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -435,7 +435,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -479,10 +479,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } - + __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) @@ -498,7 +498,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co } if (tid == 0) - { + { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; @@ -520,13 +520,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t 
*data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -543,7 +543,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -587,10 +587,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } - + __syncthreads(); if (tid == 0) { @@ -603,8 +603,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, _grad_a += cache_grad_attn_weight[tid]; sid += 2; } - - + + *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; @@ -625,13 +625,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -648,7 +648,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int 
sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -692,10 +692,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } - + __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) @@ -711,7 +711,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; - } + } } __syncthreads(); } @@ -738,13 +738,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -761,7 +761,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -805,10 +805,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const { 
ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } - + __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) @@ -852,13 +852,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, const int num_point, @@ -871,7 +871,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, int _temp = index; const int c_col = _temp % channels; _temp /= channels; - const int sampling_index = _temp; + const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; @@ -912,7 +912,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, + top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; @@ -928,15 +928,15 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t* data_value, - const int64_t* data_spatial_shapes, - const int64_t* data_level_start_index, + const int64_t* data_spatial_shapes, + const int64_t* data_level_start_index, const scalar_t* data_sampling_loc, const scalar_t* data_attn_weight, const int 
batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, const int num_query, const int num_point, scalar_t* data_col) @@ -947,9 +947,9 @@ void ms_deformable_im2col_cuda(cudaStream_t stream, ms_deformable_im2col_gpu_kernel <<>>( - num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); - + cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { @@ -966,13 +966,13 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, const int64_t * data_level_start_index, const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight, - const int batch_size, - const int spatial_size, + const int batch_size, + const int spatial_size, const int num_heads, - const int channels, + const int channels, const int num_levels, const int num_query, - const int num_point, + const int num_point, scalar_t* grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) @@ -987,17 +987,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1010,17 +1010,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_gm <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - 
spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1036,17 +1036,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1058,17 +1058,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1080,17 +1080,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1102,17 +1102,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1124,17 +1124,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - 
num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1146,17 +1146,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1168,17 +1168,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1190,17 +1190,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1212,17 +1212,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, 
num_levels, num_query, num_point, @@ -1234,17 +1234,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1256,17 +1256,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1280,17 +1280,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_reduce_v1 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1303,17 +1303,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ms_deformable_col2im_gpu_kernel_shm_reduce_v2 <<>>( - num_kernels, + num_kernels, grad_col, data_value, data_spatial_shapes, - data_level_start_index, + data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, - spatial_size, + batch_size, + spatial_size, num_heads, - channels, + channels, num_levels, num_query, num_point, @@ -1329,4 +1329,4 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } -} +} \ No newline at end of file diff --git 
a/part_distillation/modeling/pixel_decoder/ops/src/ms_deform_attn.h b/part_distillation/modeling/pixel_decoder/ops/src/ms_deform_attn.h index 1ceb4ae..2f80a1b 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/ms_deform_attn.h +++ b/part_distillation/modeling/pixel_decoder/ops/src/ms_deform_attn.h @@ -1,17 +1,16 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once @@ -25,7 +24,7 @@ Modified from https://github.com/fundamentalvision/Deformable-DETR at::Tensor ms_deform_attn_forward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -46,7 +45,7 @@ ms_deform_attn_forward( std::vector ms_deform_attn_backward( - const at::Tensor &value, + const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, @@ -65,3 +64,4 @@ ms_deform_attn_backward( } AT_ERROR("Not implemented on the CPU"); } + diff --git a/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp b/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp index 72c89ea..4a08821 100644 --- a/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp +++ b/part_distillation/modeling/pixel_decoder/ops/src/vision.cpp @@ -1,17 +1,16 @@ /*! ------------------------------------------------------------------------------------------------- -Deformable DETR -Copyright (c) 2020 SenseTime. All Rights Reserved. -Licensed under the Apache License, Version 2.0 [see LICENSE for details] ------------------------------------------------------------------------------------------------- -Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ------------------------------------------------------------------------------------------------- - -Copyright (c) Meta Platforms, Inc. and affiliates. +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ -This source code is licensed under the MIT license found in the -LICENSE file in the root directory of this source tree. -Modified from https://github.com/fundamentalvision/Deformable-DETR +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include "ms_deform_attn.h" diff --git a/part_distillation/modeling/pixel_decoder/ops/test.py b/part_distillation/modeling/pixel_decoder/ops/test.py index 94446e5..6e1b545 100644 --- a/part_distillation/modeling/pixel_decoder/ops/test.py +++ b/part_distillation/modeling/pixel_decoder/ops/test.py @@ -6,11 +6,8 @@ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Modified by from https://github.com/fundamentalvision/Deformable-DETR +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function @@ -90,3 +87,6 @@ def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) + + + diff --git a/part_distillation/modeling/transformer_decoder/__init__.py b/part_distillation/modeling/transformer_decoder/__init__.py index edc10db..b9df8f7 100644 --- a/part_distillation/modeling/transformer_decoder/__init__.py +++ b/part_distillation/modeling/transformer_decoder/__init__.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + from .maskformer_transformer_decoder import StandardTransformerDecoder from .mask2former_transformer_decoder import MultiScaleMaskedTransformerDecoder from .part_distillation_transformer_decoder import PartDistillationTransformerDecoder diff --git a/part_distillation/modeling/transformer_decoder/mask2former_transformer_decoder.py b/part_distillation/modeling/transformer_decoder/mask2former_transformer_decoder.py index 04a5abe..939c402 100644 --- a/part_distillation/modeling/transformer_decoder/mask2former_transformer_decoder.py +++ b/part_distillation/modeling/transformer_decoder/mask2former_transformer_decoder.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import logging import fvcore.nn.weight_init as weight_init @@ -16,12 +12,14 @@ from .position_encoding import PositionEmbeddingSine from .maskformer_transformer_decoder import TRANSFORMER_DECODER_REGISTRY +from fairscale.nn.checkpoint import checkpoint_wrapper class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, - activation="relu", normalize_before=False): + activation="relu", normalize_before=False, + fp16=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) @@ -30,9 +28,9 @@ def __init__(self, d_model, nhead, dropout=0.0, self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - + self.fp16 = fp16 self._reset_parameters() - + def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: @@ -62,7 +60,7 @@ def forward_pre(self, tgt, tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) - + return tgt def forward(self, tgt, @@ -90,7 +88,7 @@ def __init__(self, d_model, nhead, dropout=0.0, self.normalize_before = normalize_before self._reset_parameters() - + def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: @@ -110,7 +108,7 @@ def forward_post(self, tgt, memory, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) - + return tgt def forward_pre(self, tgt, memory, @@ -155,7 +153,7 @@ def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, self.normalize_before = normalize_before self._reset_parameters() - + def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: @@ -252,6 +250,7 @@ def __init__( mask_dim: int, enforce_input_project: bool, query_feature_normalize: bool, + use_checkpoint: bool=False, ): """ NOTE: this interface is experimental. 
@@ -278,7 +277,7 @@ def __init__( # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) - + # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers @@ -340,12 +339,22 @@ def __init__( self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) self.query_feature_normalize = query_feature_normalize + if use_checkpoint: + for layer in self.transformer_self_attention_layers: + layer = checkpoint_wrapper(layer) + + for layer in self.transformer_cross_attention_layers: + layer = checkpoint_wrapper(layer) + + for layer in self.transformer_ffn_layers: + layer = checkpoint_wrapper(layer) + @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification - + ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES @@ -364,6 +373,7 @@ def from_config(cls, cfg, in_channels, mask_classification): ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ ret["query_feature_normalize"] = cfg.MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM + ret["use_checkpoint"] = cfg.USE_CHECKPOINT return ret @@ -416,7 +426,7 @@ def forward(self, x, mask_features, mask = None): tgt_key_padding_mask=None, query_pos=query_embed ) - + # FFN output = self.transformer_ffn_layers[i]( output diff --git a/part_distillation/modeling/transformer_decoder/maskformer_transformer_decoder.py b/part_distillation/modeling/transformer_decoder/maskformer_transformer_decoder.py index b738f56..79f09fa 100644 --- a/part_distillation/modeling/transformer_decoder/maskformer_transformer_decoder.py +++ b/part_distillation/modeling/transformer_decoder/maskformer_transformer_decoder.py @@ -1,7 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
- -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. +# Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py import fvcore.nn.weight_init as weight_init import torch diff --git a/part_distillation/modeling/transformer_decoder/part_distillation_transformer_decoder.py b/part_distillation/modeling/transformer_decoder/part_distillation_transformer_decoder.py index 13e4ba0..4f38558 100644 --- a/part_distillation/modeling/transformer_decoder/part_distillation_transformer_decoder.py +++ b/part_distillation/modeling/transformer_decoder/part_distillation_transformer_decoder.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import logging import fvcore.nn.weight_init as weight_init from typing import Optional @@ -25,89 +26,14 @@ def __init__( self, in_channels, mask_classification, - # *, - # num_classes: int, - # hidden_dim: int, - # num_queries: int, - # nheads: int, - # dim_feedforward: int, - # dec_layers: int, - # pre_norm: bool, - # mask_dim: int, *args, num_object_classes: int, num_part_classes: int, **kwargs, ): super().__init__(in_channels, mask_classification, *args, **kwargs) - # assert mask_classification, "Only support mask classification model" - # self.mask_classification = mask_classification - - # # positional encoding - # N_steps = hidden_dim // 2 - # self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) - - # # define Transformer decoder here - # self.num_heads = nheads - # self.num_layers = dec_layers - # self.transformer_self_attention_layers = nn.ModuleList() - # self.transformer_cross_attention_layers = nn.ModuleList() - # self.transformer_ffn_layers = nn.ModuleList() - - # for _ in range(self.num_layers): - # self.transformer_self_attention_layers.append( - # SelfAttentionLayer( - 
# d_model=hidden_dim, - # nhead=nheads, - # dropout=0.0, - # normalize_before=pre_norm, - # ) - # ) - - # self.transformer_cross_attention_layers.append( - # CrossAttentionLayer( - # d_model=hidden_dim, - # nhead=nheads, - # dropout=0.0, - # normalize_before=pre_norm, - # ) - # ) - - # self.transformer_ffn_layers.append( - # FFNLayer( - # d_model=hidden_dim, - # dim_feedforward=dim_feedforward, - # dropout=0.0, - # normalize_before=pre_norm, - # ) - # ) - - # self.decoder_norm = nn.LayerNorm(hidden_dim) - - # self.num_queries = num_queries - # # learnable query features - # self.query_feat = nn.Embedding(num_queries, hidden_dim) - # # learnable query p.e. - # self.query_embed = nn.Embedding(num_queries, hidden_dim) - - # # level embedding (we always use 3 scales) - # self.num_feature_levels = 3 - # self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) - # self.input_proj = nn.ModuleList() - # for _ in range(self.num_feature_levels): - # if in_channels != hidden_dim or enforce_input_project: - # self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) - # weight_init.c2_xavier_fill(self.input_proj[-1]) - # else: - # self.input_proj.append(nn.Sequential()) - - # output FFNs - # if self.mask_classification: - # self.class_embed = nn.Linear(self.hidden_dim, num_part_classes * num_object_classes + 1).double() self.class_embed = nn.Linear(self.hidden_dim, num_part_classes * num_object_classes + 1).double() - # self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) - # self.query_feature_normalize = query_feature_normalize - self.num_part_classes = num_part_classes + self.num_part_classes = num_part_classes @classmethod @@ -115,7 +41,7 @@ def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification - + ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM ret["num_queries"] = 
cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES @@ -144,9 +70,9 @@ def forward(self, x, mask_features, mask = None): src = [] pos = [] size_list = [] - + # NOTE: We abuse [mask] argument, but it works for now - targets = mask + targets = mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) @@ -167,7 +93,7 @@ def forward(self, x, mask_features, mask = None): predictions_mask = [] # prediction heads on learnable query features - outputs_class, outputs_mask, attn_mask, _ = self.forward_prediction_heads(output, mask_features, targets, + outputs_class, outputs_mask, attn_mask, _ = self.forward_prediction_heads(output, mask_features, targets, attn_mask_target_size=size_list[0]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) @@ -188,20 +114,20 @@ def forward(self, x, mask_features, mask = None): tgt_key_padding_mask=None, query_pos=query_embed ) - + # FFN output = self.transformer_ffn_layers[i]( output ) - outputs_class, outputs_mask, attn_mask, decoder_output = self.forward_prediction_heads(output, mask_features, targets, + outputs_class, outputs_mask, attn_mask, decoder_output = self.forward_prediction_heads(output, mask_features, targets, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) out = { - 'query_feats': output.permute(1, 0, 2), + 'query_feats': output.permute(1, 0, 2), 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( @@ -211,24 +137,24 @@ def forward(self, x, mask_features, mask = None): } return out - + def apply_gradient_mask(self, outputs, targets): - # outputs: BxQxC + # outputs: BxQxC new_outputs = [] for i, target_per_image in enumerate(targets): start_idx = target_per_image["gt_object_class"] * self.num_part_classes end_idx = (target_per_image["gt_object_class"] + 1) * self.num_part_classes new_outputs.append(outputs[i][:, start_idx:end_idx]) - + 
new_outputs = torch.stack(new_outputs, dim=0) new_outputs = torch.cat([new_outputs, outputs[:, :, -1:]], dim=-1) - + # NOTE: Ugly trick to make pytorch optimizer happy ... - new_outputs = new_outputs + (outputs.sum() * 0) + new_outputs = new_outputs + (outputs.sum() * 0) return new_outputs - + def forward_prediction_heads(self, output, mask_features, targets, attn_mask_target_size): decoder_output = self.decoder_norm(output) @@ -252,3 +178,4 @@ def forward_prediction_heads(self, output, mask_features, targets, attn_mask_tar attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask, decoder_output + diff --git a/part_distillation/modeling/transformer_decoder/position_encoding.py b/part_distillation/modeling/transformer_decoder/position_encoding.py index cff4c65..e8c5062 100644 --- a/part_distillation/modeling/transformer_decoder/position_encoding.py +++ b/part_distillation/modeling/transformer_decoder/position_encoding.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. @@ -54,7 +50,7 @@ def forward(self, x, mask=None): ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos - + def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ diff --git a/part_distillation/modeling/transformer_decoder/transformer.py b/part_distillation/modeling/transformer_decoder/transformer.py index 653b946..ea8caa0 100644 --- a/part_distillation/modeling/transformer_decoder/transformer.py +++ b/part_distillation/modeling/transformer_decoder/transformer.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
- -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py """ Transformer class. diff --git a/part_distillation/part_distillation_model.py b/part_distillation/part_distillation_model.py index ee0d955..3149b75 100644 --- a/part_distillation/part_distillation_model.py +++ b/part_distillation/part_distillation_model.py @@ -4,12 +4,12 @@ # LICENSE file in the root directory of this source tree. -import os +import os import torch import logging -import numpy as np +import numpy as np import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F @@ -59,7 +59,7 @@ def __init__( apply_masking_with_object_mask: bool=True, ): super().__init__() - self.mode = "" + self.mode = "" self.backbone = backbone self.sem_seg_head = sem_seg_head @@ -76,6 +76,7 @@ def __init__( self.metadata = MetadataCatalog.get(train_dataset_name) self.train_dataset_name = train_dataset_name self.num_classes = num_classes + self.logger = logging.getLogger("part_distillation") # wandb self.use_wandb = use_wandb @@ -83,14 +84,14 @@ def __init__( self.wandb_vis_period_test = wandb_vis_period_test self.cpu_device = torch.device("cpu") self.wandb_vis_topk = wandb_vis_topk - + # postprocessing self.use_unique_per_pixel_label = use_unique_per_pixel_label self.min_pseudo_mask_score = min_pseudo_mask_score self.min_pseudo_mask_ratio = min_pseudo_mask_ratio self.majority_vote_mapping = {} - self.current_train_iteration = 0 - self.current_test_iteration = 0 + self.current_train_iteration = 0 + self.current_test_iteration = 0 self.use_oracle_classifier = use_oracle_classifier self.apply_masking_with_object_mask = apply_masking_with_object_mask @@ -100,13 +101,13 @@ def __init__( if comm.is_main_process(): if not 
os.path.exists(self.root_save_path): os.makedirs(self.root_save_path) - + for fname in self.metadata.class_codes: folder_path = os.path.join(self.root_save_path, fname) if not os.path.exists(folder_path): os.makedirs(folder_path) - - + + @classmethod def from_config(cls, cfg): @@ -133,7 +134,7 @@ def from_config(cls, cfg): weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight} if deep_supervision: - dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS + dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) @@ -161,7 +162,7 @@ def from_config(cls, cfg): "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, - # wandb + # wandb "wandb_vis_period_train": cfg.WANDB.VIS_PERIOD_TRAIN, "wandb_vis_period_test": cfg.WANDB.VIS_PERIOD_TEST, "wandb_vis_topk": cfg.WANDB.VIS_TOPK, @@ -172,11 +173,11 @@ def from_config(cls, cfg): "train_dataset_name": cfg.DATASETS.TRAIN[0], "num_classes": num_classes, "min_pseudo_mask_ratio": cfg.PART_DISTILLATION.MIN_AREA_RATIO, - "min_pseudo_mask_score": cfg.PART_DISTILLATION.MIN_SCORE, + "min_pseudo_mask_score": cfg.PART_DISTILLATION.MIN_SCORE, "use_oracle_classifier": cfg.PART_DISTILLATION.USE_ORACLE_CLASSIFIER, "apply_masking_with_object_mask": cfg.PART_DISTILLATION.APPLY_MASKING_WITH_OBJECT_MASK, } - + @property def device(self): @@ -186,14 +187,14 @@ def device(self): def register_metadata(self, dataset_name): self.logger.info("{} is registered for evaluation.".format(dataset_name)) self.metadata = MetadataCatalog.get(dataset_name) - + def update_majority_vote_mapping(self, mapping_dict): self.logger.info("Updating class mapping based on majrotiy vote.") for cid, mapping in mapping_dict.items(): self.majority_vote_mapping[cid] = mapping.to(self.device) - + def forward(self, batched_inputs): images = 
[x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] @@ -202,7 +203,7 @@ def forward(self, batched_inputs): features = self.backbone(images.tensor) targets = self.prepare_targets(batched_inputs, images) # NOTE: abusing the "mask" argument but works for now - outputs = self.sem_seg_head(features, mask=targets) + outputs = self.sem_seg_head(features, mask=targets) if self.training: # bipartite matching-based loss @@ -214,15 +215,15 @@ def forward(self, batched_inputs): else: # remove this loss if not specified in `weight_dict` losses.pop(k) - + if self.use_wandb and comm.is_main_process(): if self.current_train_iteration % self.wandb_vis_period_train == 0: with torch.no_grad(): processed_results_vis = self.inference(batched_inputs, targets, images, outputs, vis=True) self.wandb_visualize(batched_inputs, images, processed_results_vis) del processed_results_vis - - self.current_train_iteration += 1 + + self.current_train_iteration += 1 return losses else: processed_results = self.inference(batched_inputs, targets, images, outputs, vis=False) @@ -232,7 +233,7 @@ def forward(self, batched_inputs): self.wandb_visualize(batched_inputs, images, processed_results_vis) del processed_results_vis - self.current_test_iteration += 1 + self.current_test_iteration += 1 return processed_results @@ -253,8 +254,8 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): mask_cls_results, mask_pred_results, targets, batched_inputs, images.image_sizes )): # NOTE: Unlike standard pipeline, we provide gt label as input for inference. - # This reshapes the labels to input size already, so we want to reshape - # both gts and predictions to the original image size. + # This reshapes the labels to input size already, so we want to reshape + # both gts and predictions to the original image size. 
height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) @@ -262,7 +263,7 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): target_mask = retry_if_cuda_oom(sem_seg_postprocess)(target["masks"].float(), image_size, height, width).bool() target_object_mask = retry_if_cuda_oom(sem_seg_postprocess)(target["object_mask"].float(), image_size, height, width).bool() mask_cls_result = mask_cls_result.to(mask_pred_result) - + processed_results.append({}) instance_r = self.instance_inference_with_classification(mask_cls_result, mask_pred_result, target_mask, \ @@ -270,23 +271,23 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): if self.mode == "save" and not vis: self.save_part_segmentation(input_per_image, instance_r) - processed_results[-1]["predictions"] = instance_r - + processed_results[-1]["predictions"] = instance_r + target_inst = Instances(target_mask.shape[-2:]) target_inst.gt_masks = target_mask - target_inst.gt_classes = target["labels"] - + target_inst.gt_classes = target["labels"] + # For visualization - target_inst.pred_masks = target_mask - target_inst.pred_classes = target["labels"] - - # For evaluation + target_inst.pred_masks = target_mask + target_inst.pred_classes = target["labels"] + + # For evaluation processed_results[-1]["gt_instances"] = target_inst processed_results[-1]["gt_object_label"] = target["gt_object_class"] return processed_results - + def save_part_segmentation(self, input_per_image, instance): if instance is not None: H, W = instance.pred_masks.shape[1:] @@ -297,26 +298,26 @@ def save_part_segmentation(self, input_per_image, instance): res = {"file_name": input_per_image["file_name"], "image_id": input_per_image["image_id"], "class_code": input_per_image["class_code"], - "height": H, - "width": W, + "height": H, + "width": W, "part_masks": proposals_to_coco_json(instance.pred_masks.cpu()), - "part_labels": instance.pred_classes.cpu(), + 
"part_labels": instance.pred_classes.cpu(), "part_area_ratios":part_areas / object_area, "object_ratio": object_area / image_area, "part_scores": instance.scores.cpu().numpy()} torch.save(res, os.path.join(self.root_save_path, input_per_image["class_code"], input_per_image["image_id"])) - del res - del instance - del input_per_image - - + del res + del instance + del input_per_image + + def masking_with_object_mask(self, masks_per_image, target_masks): if self.apply_masking_with_object_mask: - object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() + object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() - return masks_per_image * object_target_mask + return masks_per_image * object_target_mask else: return masks_per_image @@ -327,7 +328,7 @@ def match_gt_labels(self, masks_per_image, scores_per_image, prop_feats_per_imag pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > self.fg_score_threshold).flatten() @@ -343,7 +344,7 @@ def match_gt_labels(self, masks_per_image, scores_per_image, prop_feats_per_imag def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, class_labels): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. 
if self.use_unique_per_pixel_label: - # segmentation + # segmentation predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() scoremap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = scoremap_per_image.unique() @@ -361,19 +362,19 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla newmasks_per_image[i] = segmasks_per_image[class_labels == cid].sum(dim=0).bool() newscore_per_image[i] = scores_per_image[class_labels == cid].topk(1, dim=0)[0].flatten() - # filter + # filter loc_valid_idxs = newmasks_per_image.flatten(1).sum(dim=1) / obj_map_per_image.flatten(1).sum(dim=1) > self.min_pseudo_mask_ratio if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] newscore_per_image = newscore_per_image[loc_valid_idxs] new_class_labels = new_class_labels[loc_valid_idxs] - + loc_valid_idxs = newscore_per_image > self.min_pseudo_mask_score if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] newscore_per_image = newscore_per_image[loc_valid_idxs] new_class_labels = new_class_labels[loc_valid_idxs] - + return newmasks_per_image.bool(), newscore_per_image, new_class_labels else: # filter @@ -383,7 +384,7 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla masks_per_image = predmask_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] class_labels = class_labels[loc_valid_idxs] - + loc_valid_idxs = scores_per_image > self.min_pseudo_mask_score if loc_valid_idxs.any(): masks_per_image = masks_per_image[loc_valid_idxs] @@ -404,54 +405,53 @@ def prepare_targets(self, inputs, images): def _prepare_pseudo_targets(self, inputs, images): """ - return: Instance with gt_masks field. + return: Instance with gt_masks field. """ pseudo_targets = [x["instances"].to(self.device) for x in inputs] h_pad, w_pad = images.tensor.shape[-2:] # NOTE: Assume same size for all images ? 
new_targets = [] for idx, pseudo_targets_per_image in enumerate(pseudo_targets): gt_psuedo_masks = pseudo_targets_per_image.gt_masks.tensor - padded_pseudo_masks = torch.zeros((gt_psuedo_masks.shape[0], h_pad, w_pad), + padded_pseudo_masks = torch.zeros((gt_psuedo_masks.shape[0], h_pad, w_pad), dtype=gt_psuedo_masks.dtype, device=gt_psuedo_masks.device) padded_pseudo_masks[:, : gt_psuedo_masks.shape[1], : gt_psuedo_masks.shape[2]] = gt_psuedo_masks n = padded_pseudo_masks.shape[0] gt_labels = pseudo_targets_per_image.gt_classes.to(self.device) gt_object_class = inputs[idx]["gt_object_class"] - new_targets.append({"labels": gt_labels, - "masks": padded_pseudo_masks, + "masks": padded_pseudo_masks, "object_mask": padded_pseudo_masks.sum(dim=0, keepdim=True), - "gt_object_class": gt_object_class, + "gt_object_class": gt_object_class, }) - + return new_targets - + def _prepare_gt_targets(self, inputs, images): targets = [x["part_instances"].to(self.device) for x in inputs] object_targets = [x["instances"].to(self.device) for x in inputs] - - h_pad, w_pad = images.tensor.shape[-2:] + + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for input_per_image, object_targets_per_image, targets_per_image in zip(inputs, object_targets, targets): gt_mask = targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask - gt_obj_masks = object_targets_per_image.gt_masks.tensor - padded_obj_masks = torch.zeros((gt_obj_masks.shape[0], h_pad, w_pad), + gt_obj_masks = object_targets_per_image.gt_masks.tensor + padded_obj_masks = torch.zeros((gt_obj_masks.shape[0], h_pad, w_pad), dtype=gt_obj_masks.dtype, device=gt_obj_masks.device) padded_obj_masks[:, : gt_obj_masks.shape[1], : gt_obj_masks.shape[2]] = gt_obj_masks - - new_targets.append({"labels": 
targets_per_image.gt_classes.to(self.device), + + new_targets.append({"labels": targets_per_image.gt_classes.to(self.device), "masks": padded_masks, "object_mask": padded_obj_masks, - "gt_object_class": object_targets_per_image.gt_classes.to(self.device), + "gt_object_class": object_targets_per_image.gt_classes.to(self.device), }) - + return new_targets @@ -475,15 +475,15 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas mask_pred = mask_pred[topk_indices] mask_pred = self.masking_with_object_mask(mask_pred, target_object_mask) - # unique mapping and merging + # unique mapping and merging mask_pred_bool, scores_per_image, labels_per_image = self._unique_assignment_with_classes(mask_pred, scores_per_image, labels_per_image) mask_pred_bool, scores_per_image, labels_per_image, gt_part_labels = \ self.match_gt_labels(mask_pred_bool, scores_per_image, labels_per_image, target_mask, target_labels) if mask_pred_bool.shape[0] == 0: - # Doesn't contribute to the evaluation. + # Doesn't contribute to the evaluation. mask_pred_bool = mask_pred.new_zeros(1, *mask_pred.shape[1:]).bool() - scores_per_image = scores_per_image.new_zeros(1) + scores_per_image = scores_per_image.new_zeros(1) labels_per_image = scores_per_image.new_ones(1).long() * self.num_classes gt_part_labels = scores_per_image.new_ones(1).long() * self.num_classes @@ -492,11 +492,11 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas pred_masks_float = result.pred_masks.float() result.scores = scores_per_image - + if self.use_oracle_classifier: - result.pred_classes = gt_part_labels + result.pred_classes = gt_part_labels else: - result.pred_classes = labels_per_image + result.pred_classes = labels_per_image return result @@ -506,7 +506,7 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): # NOTE: Hack to use input as visualization image. 
images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["predictions"].to(self.cpu_device) for r in processed_results] @@ -514,7 +514,7 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) visualizer = Partvisualizer(image, self.metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=instances) @@ -524,6 +524,6 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): visualizer = Partvisualizer(image, self.metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) diff --git a/part_distillation/part_ranking_model.py b/part_distillation/part_ranking_model.py index 970a84d..fca2ebc 100644 --- a/part_distillation/part_ranking_model.py +++ b/part_distillation/part_ranking_model.py @@ -3,12 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os + +import os import torch import logging -import numpy as np +import numpy as np import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F @@ -54,6 +55,9 @@ def __init__( classifier_metric: str="l2", dataset_name: str="", proposal_features_norm: bool=True, + root_folder_name: str="pseudo_labels", + weight_name: str="default", + save_annotations: bool=False, debug: bool=False, ): super().__init__() @@ -67,16 +71,16 @@ def __init__( self.size_divisibility = size_divisibility self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) - + # wandb self.use_wandb = use_wandb - self.num_iters = 0 + self.num_iters = 0 self.wandb_vis_period = wandb_vis_period self.cpu_device = torch.device("cpu") self.wandb_vis_topk = wandb_vis_topk # postprocess - self.mode = "" + self.mode = "" self.test_topk_per_image = test_topk_per_image self.proposal_features_norm = proposal_features_norm self.proposal_key = proposal_key @@ -94,20 +98,29 @@ def __init__( self.majority_vote_mapping = {} # setup save dir - dataset_name_dir = dataset_name.replace("_pre_labeling", "")if not debug else "debug" - self.root_save_path = "pseudo_labels/part_labels/part_masks_with_class/{}/{}_{}/"\ - .format(dataset_name_dir, classifier_metric, num_clusters) - self.metadata = MetadataCatalog.get(dataset_name) - if comm.is_main_process(): - if not os.path.exists(self.root_save_path): - os.makedirs(self.root_save_path) - - for fname in self.metadata.class_codes: - folder_path = os.path.join(self.root_save_path, fname) - if not os.path.exists(folder_path): - os.makedirs(folder_path) - - + if save_annotations: + dataset_name_dir = dataset_name.replace("_pre_labeling", "")if not debug else "debug" + self.root_save_path = "{}/part_labels/part_masks_with_class/{}/{}/{}_{}/r1_{}_s1_{}_r2_{}_s2_{}/"\ + .format( + root_folder_name, + 
dataset_name_dir, + weight_name, + classifier_metric, + num_clusters, + min_pseudo_mask_ratio_1, + min_pseudo_mask_score_1, + min_pseudo_mask_ratio_2, + min_pseudo_mask_score_2) + self.metadata = MetadataCatalog.get(dataset_name) + if comm.is_main_process(): + if not os.path.exists(self.root_save_path): + os.makedirs(self.root_save_path) + + for fname in self.metadata.class_codes: + folder_path = os.path.join(self.root_save_path, fname) + if not os.path.exists(folder_path): + os.makedirs(folder_path) + @classmethod def from_config(cls, cfg): @@ -121,7 +134,7 @@ def from_config(cls, cfg): "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, - # wandb + # wandb "wandb_vis_period": cfg.WANDB.VIS_PERIOD_TEST, "wandb_vis_topk": cfg.WANDB.VIS_TOPK, "use_wandb": not cfg.WANDB.DISABLE_WANDB, @@ -135,13 +148,16 @@ def from_config(cls, cfg): "dataset_name": cfg.DATASETS.TEST[0], "num_clusters": cfg.PART_RANKING.NUM_CLUSTERS, "proposal_features_norm": cfg.PART_RANKING.PROPOSAL_FEATURE_NORM, - "min_pseudo_mask_ratio_1": cfg.PART_RANKING.MIN_AREA_RATIO_1, + "min_pseudo_mask_ratio_1": cfg.PART_RANKING.MIN_AREA_RATIO_1, "min_pseudo_mask_score_1": cfg.PART_RANKING.MIN_SCORE_1, - "min_pseudo_mask_ratio_2": cfg.PART_RANKING.MIN_AREA_RATIO_2, + "min_pseudo_mask_ratio_2": cfg.PART_RANKING.MIN_AREA_RATIO_2, "min_pseudo_mask_score_2": cfg.PART_RANKING.MIN_SCORE_2, + "root_folder_name": cfg.PART_RANKING.ROOT_FOLDER_NAME, + "weight_name": cfg.PART_RANKING.WEIGHT_NAME, + "save_annotations": cfg.PART_RANKING.SAVE_ANNOTATIONS, "debug": cfg.PART_RANKING.DEBUG, } - + def num_classes(self, k): return self.classifier[k.item()].weight.data.shape[0] @@ -150,7 +166,7 @@ def num_classes(self, k): def register_metadata(self, dataset_name): self.logger.info("{} is registered for evaluation.".format(dataset_name)) self.metadata = MetadataCatalog.get(dataset_name) - + def update_majority_vote_mapping(self, mapping_dict): 
self.logger.info("Updating class mapping based on majrotiy vote.") @@ -162,7 +178,7 @@ def update_majority_vote_mapping(self, mapping_dict): def device(self): return self.pixel_mean.device - + def forward(self, batched_inputs): assert not self.training, "part ranking is eval-only." images = [x["image"].to(self.device) for x in batched_inputs] @@ -203,8 +219,8 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): mask_cls_results, mask_pred_results, proposal_feats, targets, batched_inputs, images.image_sizes )): # NOTE: Unlike standard pipeline, we provide gt label as input for inference. - # This reshapes the labels to input size already, so we want to reshape - # both gts and predictions to the original image size. + # This reshapes the labels to input size already, so we want to reshape + # both gts and predictions to the original image size. height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(mask_pred_result, image_size, height, width) @@ -215,44 +231,44 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): if self.mode == "cluster": masks_per_image, scores_per_image, proposal_feats_per_image = \ - self.instance_inference_with_proposal_feats(proposal_feats_per_image, - mask_cls_result, - mask_pred_result, - target_mask, - target_object_mask, + self.instance_inference_with_proposal_feats(proposal_feats_per_image, + mask_cls_result, + mask_pred_result, + target_mask, + target_object_mask, vis=vis) masks_per_image, scores_per_image, proposal_feats_per_image = \ - self.match_gt_masks(masks_per_image, - scores_per_image, - proposal_feats_per_image, + self.match_gt_masks(masks_per_image, + scores_per_image, + proposal_feats_per_image, target_mask) result = Instances(image_size) result.pred_masks = masks_per_image.bool() result.scores = scores_per_image - processed_results[-1]["predictions"] = result + 
processed_results[-1]["predictions"] = result processed_results[-1]["proposal_features"] = proposal_feats_per_image else: - instance_r = self.instance_inference_with_classification(proposal_feats_per_image, - mask_cls_result, - mask_pred_result, - target_mask, - target_object_mask, - target["object_label"], + instance_r = self.instance_inference_with_classification(proposal_feats_per_image, + mask_cls_result, + mask_pred_result, + target_mask, + target_object_mask, + target["object_label"], vis=vis) processed_results[-1]["predictions"] = instance_r if not vis and self.mode == "save": self.save_generated_part_labels(input_per_image, target["object_label"], instance_r) - + target_inst = Instances(target_mask.shape[-2:]) target_inst.gt_masks = target_mask target_inst.pred_masks = target_mask # for visualization if "part_labels" in target: target_inst.gt_classes = target["part_labels"] processed_results[-1]["gt_instances"] = target_inst - processed_results[-1]["gt_object_label"] = target["object_label"] + processed_results[-1]["gt_object_label"] = target["object_label"] processed_results[-1]["gt_label"] = torch.tensor([target["object_label"] for _ in range(len(proposal_feats_per_image))]) - + return processed_results @@ -263,33 +279,33 @@ def save_generated_part_labels(self, input_per_image, label, instance): res = {"file_name": input_per_image["file_name"], "image_id": input_per_image["image_id"], "class_code": input_per_image["class_code"], - "height": H, - "width": W, + "height": H, + "width": W, "part_masks": proposals_to_coco_json(instance.pred_masks.cpu()), - "part_labels": instance.pred_classes.cpu(), + "part_labels": instance.pred_classes.cpu(), "object_ratio": instance.pred_masks.cpu().sum().long().item() / (H*W), "part_ratios": instance.pred_masks.cpu().flatten(1).sum(-1) / (H*W), - "object_class_label": label.item(), + "object_class_label": label.item(), "part_scores": instance.scores.cpu().numpy()} torch.save(res, os.path.join(self.root_save_path, 
input_per_image["class_code"], input_per_image["image_id"])) - + def masking_with_object_mask(self, masks_per_image, target_masks): if self.apply_masking_with_object_mask: - object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() + object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() - return masks_per_image * object_target_mask + return masks_per_image * object_target_mask else: return masks_per_image - + def match_gt_masks(self, masks_per_image, scores_per_image, prop_feats_per_image, target_masks): pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > self.fg_score_threshold).flatten() @@ -300,11 +316,11 @@ def match_gt_masks(self, masks_per_image, scores_per_image, prop_feats_per_image return masks_per_image, scores_per_image, prop_feats_per_image - + def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, class_labels): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. 
if self.use_unique_per_pixel_label_during_labeling: - # segmentation + # segmentation predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() scoremap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = scoremap_per_image.unique() @@ -322,19 +338,19 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla newmasks_per_image[i] = segmasks_per_image[class_labels == cid].sum(dim=0).bool() newscore_per_image[i] = scores_per_image[class_labels == cid].topk(1, dim=0)[0].flatten() - # filter + # filter loc_valid_idxs = newmasks_per_image.flatten(1).sum(dim=1) / obj_map_per_image.flatten(1).sum(dim=1) > self.min_pseudo_mask_ratio_2 if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] newscore_per_image = newscore_per_image[loc_valid_idxs] new_class_labels = new_class_labels[loc_valid_idxs] - + loc_valid_idxs = newscore_per_image > self.min_pseudo_mask_score_2 if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] newscore_per_image = newscore_per_image[loc_valid_idxs] new_class_labels = new_class_labels[loc_valid_idxs] - + return newmasks_per_image.bool(), newscore_per_image, new_class_labels else: # filter @@ -344,7 +360,7 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla masks_per_image = predmask_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] class_labels = class_labels[loc_valid_idxs] - + loc_valid_idxs = scores_per_image > self.min_pseudo_mask_score_2 if loc_valid_idxs.any(): masks_per_image = masks_per_image[loc_valid_idxs] @@ -354,11 +370,11 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla return (masks_per_image > 0), scores_per_image, class_labels - + def _unique_assignment(self, masks_per_image, scores_per_image, prop_feats_per_image, mask_prop_feats=None): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. 
if self.use_unique_per_pixel_label_during_clustering: - # unique assignment + # unique assignment predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() scoremap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = scoremap_per_image.unique() @@ -374,13 +390,13 @@ def _unique_assignment(self, masks_per_image, scores_per_image, prop_feats_per_i newmasks_per_image = newmasks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] prop_feats_per_image = prop_feats_per_image[loc_valid_idxs] - + loc_valid_idxs = scores_per_image > self.min_pseudo_mask_score_1 if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] prop_feats_per_image = prop_feats_per_image[loc_valid_idxs] - + return newmasks_per_image.bool(), scores_per_image, prop_feats_per_image else: # filter @@ -389,7 +405,7 @@ def _unique_assignment(self, masks_per_image, scores_per_image, prop_feats_per_i masks_per_image = masks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] prop_feats_per_image = prop_feats_per_image[loc_valid_idxs] - + loc_valid_idxs = scores_per_image > self.min_pseudo_mask_score_1 if loc_valid_idxs.any(): masks_per_image = masks_per_image[loc_valid_idxs] @@ -401,60 +417,60 @@ def _unique_assignment(self, masks_per_image, scores_per_image, prop_feats_per_i def prepare_targets(self, inputs, images): if "part_instances" in inputs[0]: - # evaluation + # evaluation part_targets = [x["part_instances"].to(self.device) for x in inputs] object_targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for part_targets_per_image, object_targets_per_image in zip(part_targets, object_targets): gt_mask = part_targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = 
torch.zeros((gt_mask.shape[0], h_pad, w_pad), dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask gt_obj_mask = object_targets_per_image.gt_masks.tensor - padded_obj_mask = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), + padded_obj_mask = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), dtype=gt_obj_mask.dtype, device=gt_obj_mask.device) padded_obj_mask[:, : gt_obj_mask.shape[1], : gt_obj_mask.shape[2]] = gt_obj_mask - new_targets.append({"part_labels": part_targets_per_image.gt_classes.to(self.device), - "object_label": object_targets_per_image.gt_classes.to(self.device), + new_targets.append({"part_labels": part_targets_per_image.gt_classes.to(self.device), + "object_label": object_targets_per_image.gt_classes.to(self.device), "masks": padded_masks, "object_mask": padded_obj_mask}) else: #labeling targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for i, targets_per_image in enumerate(targets): gt_mask = targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask - new_targets.append({"object_label": targets_per_image.gt_classes.to(self.device), + new_targets.append({"object_label": targets_per_image.gt_classes.to(self.device), "masks": padded_masks, "object_mask": padded_masks}) - + return new_targets - + def register_classifier(self, centroids_dict): for cid, centroids in centroids_dict.items(): - num_cls, in_dim = centroids.shape + num_cls, in_dim = centroids.shape self.classifier[cid] = nn.Linear(in_dim, num_cls, bias=False).to(self.device) - self.classifier[cid].weight.data = centroids.to(self.device) - + self.classifier[cid].weight.data = centroids.to(self.device) + def 
use_classifier(self, features, cid): if cid not in self.classifier: raise ValueError("class ID {} not in classifier. ({})".format(cid, self.classifier.keys())) if self.classifier_metric == "l2": - # Efficient negative l2 distance implementation. + # Efficient negative l2 distance implementation. y = self.classifier[cid].weight.data xy = self.classifier[cid](features) # NxK - xx = (features * features).sum(dim=1)[:, None] # Nx1 + xx = (features * features).sum(dim=1)[:, None] # Nx1 yy = (y * y).sum(dim=1) # Kx1 return xy - xx - yy.t() @@ -462,7 +478,7 @@ def use_classifier(self, features, cid): elif self.classifier_metric == "dot": return self.classifier[cid](features) - + def instance_inference_with_classification(self, proposal_feats, mask_cls, mask_pred, target_mask, target_object_mask, target_label, vis=False): # mask_pred is already processed to have the same shape as original input image_size = mask_pred.shape[-2:] @@ -471,9 +487,9 @@ def instance_inference_with_classification(self, proposal_feats, mask_cls, mask_ object_scores = mask_cls.softmax(-1)[:, :1] cls_outputs = self.use_classifier(proposal_feats, target_label.item()) class_scores = cls_outputs.softmax(-1) # QxK - + # score = ranking score * confidence. 
- scores = object_scores * class_scores + scores = object_scores * class_scores topk = self.wandb_vis_topk if vis and not self.use_unique_per_pixel_label_during_labeling else self.test_topk_per_image labels = torch.arange(self.num_classes(target_label), device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten() scores_per_image, topk_indices = scores.flatten().topk(topk, sorted=False) @@ -488,34 +504,34 @@ def instance_inference_with_classification(self, proposal_feats, mask_cls, mask_ raise ValueError("Class mapping is not registered.") else: labels_per_image = labels[topk_indices] - + topk_indices = torch.div(topk_indices, self.num_classes(target_label), rounding_mode='floor') mask_pred = mask_pred[topk_indices] # refine part mask with object mask mask_pred = self.masking_with_object_mask(mask_pred, target_object_mask) - # unique mapping and merging + # unique mapping and merging mask_pred_bool, scores_per_image, labels_per_image = self._unique_assignment_with_classes(mask_pred, scores_per_image, labels_per_image) mask_pred_bool, scores_per_image, labels_per_image = \ self.match_gt_masks(mask_pred_bool, scores_per_image, labels_per_image, target_mask) if mask_pred_bool.shape[0] == 0: - # doesn't contribute to the evaluation. + # doesn't contribute to the evaluation. 
mask_pred_bool = mask_pred.new_zeros(1, *mask_pred.shape[1:]).bool() - scores_per_image = scores_per_image.new_zeros(1) + scores_per_image = scores_per_image.new_zeros(1) labels_per_image = scores_per_image.new_zeros(1).long() result = Instances(image_size) result.pred_masks = mask_pred_bool pred_masks_float = result.pred_masks.float() result.scores = scores_per_image - result.pred_classes = labels_per_image + result.pred_classes = labels_per_image return result - + def instance_inference_with_proposal_feats(self, proposal_feats, mask_cls, mask_pred, target_mask, target_object_mask, vis=False): # mask_pred is already processed to have the same shape as original input image_size = mask_pred.shape[-2:] @@ -529,19 +545,19 @@ def instance_inference_with_proposal_feats(self, proposal_feats, mask_cls, mask_ # get unique assignment if needed masks_per_image, scores_per_image, prop_feats_per_image = self._unique_assignment(masks_per_image, scores_per_image, prop_feats_per_image) - + # refine part masks with object mask if needed masks_per_image = self.masking_with_object_mask(masks_per_image, target_object_mask) return masks_per_image, scores_per_image, prop_feats_per_image - + def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): if self.num_iters % self.wandb_vis_period == 0: # NOTE: Hack to use input as visualization image. 
images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["predictions"].to(self.cpu_device) for r in processed_results] @@ -549,7 +565,7 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) visualizer = Partvisualizer(image, self.metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=instances) @@ -559,8 +575,9 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): visualizer = Partvisualizer(image, self.metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) - self.num_iters += 1 + self.num_iters += 1 + diff --git a/part_distillation/pixel_grouping_model.py b/part_distillation/pixel_grouping_model.py index 0eede50..cb8aeab 100644 --- a/part_distillation/pixel_grouping_model.py +++ b/part_distillation/pixel_grouping_model.py @@ -3,15 +3,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os -import numpy as np + +import os +import numpy as np import torch import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F -from sklearn.cluster import KMeans +from sklearn.cluster import KMeans from typing import Tuple, List from detectron2.config import configurable from detectron2.data import MetadataCatalog @@ -38,10 +39,11 @@ def __init__( distance_metric: str="l2", backbone_feature_key_list: List[str]=["res4"], num_superpixel_clusters: int=4, - feature_normalize: bool=False, + feature_normalize: bool=False, debug: bool=False, object_mask_type: str="detic_based", - wandb_vis_period: int=100 + wandb_vis_period: int=100, + use_wandb: bool=False, ): super().__init__() self.backbone = backbone @@ -52,31 +54,33 @@ def __init__( self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) self.cpu_device = torch.device("cpu") - # Clustering-related. + # Clustering-related. self.distance_metric = distance_metric self.backbone_feature_key_list = backbone_feature_key_list self.num_superpixel_clusters = num_superpixel_clusters self.feature_normalize = feature_normalize self.kmeans_module = KMeans(n_clusters=num_superpixel_clusters, random_state=0) self.wandb_vis_period = wandb_vis_period - self.num_test_iterations = 0 - + self.num_test_iterations = 0 + self.use_wandb = use_wandb + @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) - + return { "backbone": backbone, - "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, # Set to 32. + "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, # Set to 32. 
"pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "distance_metric": cfg.PIXEL_GROUPING.DISTANCE_METRIC, "backbone_feature_key_list": cfg.PIXEL_GROUPING.BACKBONE_FEATURE_KEY_LIST, "num_superpixel_clusters": cfg.PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS, "feature_normalize": cfg.PIXEL_GROUPING.FEATURE_NORMALIZE, - "wandb_vis_period": cfg.WANDB.VIS_PERIOD_TEST, + "wandb_vis_period": cfg.WANDB.VIS_PERIOD_TEST, + "use_wandb": not cfg.WANDB.DISABLE_WANDB, "debug": cfg.PIXEL_GROUPING.DEBUG, } @@ -86,27 +90,27 @@ def device(self): def prepare_mask(self, inputs, images): - # evaluation + # evaluation part_targets = [x["part_instances"].to(self.device) for x in inputs] object_targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] - for part_targets_per_image, object_pargets_per_image in zip(part_targets, object_targets): + for part_targets_per_image, object_targets_per_image in zip(part_targets, object_targets): part_gt_mask = part_targets_per_image.gt_masks.tensor - part_padded_masks = torch.zeros((part_gt_mask.shape[0], h_pad, w_pad), + part_padded_masks = torch.zeros((part_gt_mask.shape[0], h_pad, w_pad), dtype=part_gt_mask.dtype, device=part_gt_mask.device) part_padded_masks[:, : part_gt_mask.shape[1], : part_gt_mask.shape[2]] = part_gt_mask - object_gt_mask = object_pargets_per_image.gt_masks.tensor - obj_padded_masks = torch.zeros((object_gt_mask.shape[0], h_pad, w_pad), + object_gt_mask = object_targets_per_image.gt_masks.tensor + obj_padded_masks = torch.zeros((object_gt_mask.shape[0], h_pad, w_pad), dtype=part_gt_mask.dtype, device=object_gt_mask.device) obj_padded_masks[:, : object_gt_mask.shape[1], : object_gt_mask.shape[2]] = object_gt_mask - new_targets.append({"part_labels": part_targets_per_image.gt_classes.to(self.device), - "object_label": object_pargets_per_image.gt_classes.to(self.device), + new_targets.append({"part_labels": 
part_targets_per_image.gt_classes.to(self.device), + "object_label": object_targets_per_image.gt_classes.to(self.device), "part_masks": part_padded_masks, "masks": obj_padded_masks}) - + return new_targets @@ -117,11 +121,11 @@ def _prepare_features(self, features): for k, v in feat_dict.items(): feat_dict[k] = F.interpolate(v, size=(H, W), mode="bilinear", align_corners=False) - + feat_out = torch.cat([feat_dict[k] for k in self.backbone_feature_key_list], dim=1) if self.feature_normalize: feat_out = F.normalize(feat_out, dim=1, p=2) - + return feat_out @@ -132,7 +136,7 @@ def forward(self, batched_inputs): images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) - + targets = self.prepare_mask(batched_inputs, images) features = self.backbone(images.tensor) features = self._prepare_features(features) @@ -147,7 +151,7 @@ def forward(self, batched_inputs): for input_per_image, feature_per_image, feature_resized_per_image, image_size, targets_per_image in \ zip(batched_inputs, features, features_resized, images.image_sizes, targets): pseudo_label_list.append({}) - + height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) @@ -159,24 +163,24 @@ def forward(self, batched_inputs): masks = F.interpolate(masks[None].float(), size=feature_per_image.shape[-2:], mode="nearest")[0, 0].bool() pseudo_label = retry_if_cuda_oom(self.generate_part_segments)(input_per_image, feature_per_image, feature_resized_per_image, masks, masks_resized) - + instance = Instances(pseudo_label.shape[-2:]) - instance.pred_masks = pseudo_label + instance.pred_masks = pseudo_label instance.scores = pseudo_label.new_ones(pseudo_label.shape[0]) gt_instance = Instances(gt_part_masks.shape[-2:]) gt_instance.gt_masks = gt_part_masks gt_instance.pred_masks = gt_part_masks - pseudo_label_list[-1]["proposals"] = instance + 
pseudo_label_list[-1]["proposals"] = instance pseudo_label_list[-1]["gt_masks"] = gt_instance if comm.is_main_process(): - if self.num_test_iterations % self.wandb_vis_period == 0: + if self.use_wandb and (self.num_test_iterations % self.wandb_vis_period == 0): self.wandb_visualize(batched_inputs, images, pseudo_label_list) - self.num_test_iterations += 1 + self.num_test_iterations += 1 - return pseudo_label_list + return pseudo_label_list @@ -189,26 +193,26 @@ def get_pixel_grouping(self, feature_per_image, pred_mask): centroids = torch.tensor(centroids).float() else: centroids = data.new_zeros(1, feature_per_image.shape[0]) - - return centroids + + return centroids def measure_distance(self, A, B): if self.distance_metric == "dot": - return A @ B.T + return A @ B.T elif self.distance_metric == "l2": return 2 * A @ B.T - (A * A).sum(dim=1)[:, None] - (B * B).sum(1, keepdim=True).t() - + def generate_part_segments(self, input_per_image, feature_per_image, feature_resized_per_image, object_mask, object_mask_resized): centroids = self.get_pixel_grouping(feature_per_image, object_mask) feature_prop = feature_resized_per_image[:, object_mask_resized].transpose(0, 1).contiguous().cpu() pred_labels = self.measure_distance(feature_prop, centroids).topk(1, dim=1)[1].flatten() + 1 - mask = feature_prop.new_zeros(feature_resized_per_image.shape[-2:]).long() - mask[torch.where(object_mask_resized==True)] = pred_labels + mask = feature_prop.new_zeros(feature_resized_per_image.shape[-2:]).long() + mask[torch.where(object_mask_resized==True)] = pred_labels pred_labels_unique = pred_labels.unique() binary_mask = mask.new_zeros(len(pred_labels_unique), *feature_resized_per_image.shape[-2:]).bool() # PxHxW @@ -216,13 +220,13 @@ def generate_part_segments(self, input_per_image, feature_per_image, feature_res binary_mask[i] = mask == plbl return binary_mask - - + + def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): - # NOTE: Hack to use input as visualization 
image. + # NOTE: Hack to use input as visualization image. images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["proposals"].to(self.cpu_device) for r in processed_results] @@ -230,12 +234,12 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) targets = Instances(instances.pred_masks.shape[-2:]) visualizer = Partvisualizer(image, None, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) @@ -244,3 +248,5 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image_pd = wandb.Image(vis_output.get_image()) wandb.log({"predictions": image_pd}) + + diff --git a/part_distillation/proposal_generation_model.py b/part_distillation/proposal_generation_model.py index 5d31200..1031749 100644 --- a/part_distillation/proposal_generation_model.py +++ b/part_distillation/proposal_generation_model.py @@ -3,15 +3,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os -import numpy as np + +import os +import numpy as np import torch import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F -from sklearn.cluster import KMeans +from sklearn.cluster import KMeans from typing import Tuple, List from detectron2.config import configurable from detectron2.data import MetadataCatalog @@ -38,9 +39,10 @@ def __init__( distance_metric: str="l2", backbone_feature_key_list: List[str]=["res4"], num_superpixel_clusters: int=4, - feature_normalize: bool=False, + feature_normalize: bool=False, wandb_vis_period: int=100, debug: bool=False, + use_wandb: bool=False, ): super().__init__() self.backbone = backbone @@ -51,36 +53,37 @@ def __init__( self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) self.cpu_device = torch.device("cpu") - # clustering-related. + # clustering-related. self.distance_metric = distance_metric self.backbone_feature_key_list = backbone_feature_key_list self.num_superpixel_clusters = num_superpixel_clusters self.feature_normalize = feature_normalize self.kmeans_module = KMeans(n_clusters=num_superpixel_clusters, random_state=0) self.metadata = MetadataCatalog.get(dataset_name) - self.num_test_iterations = 0 + self.num_test_iterations = 0 self.wandb_vis_period = wandb_vis_period self.debug = debug + self.use_wandb = use_wandb self.root_save_path = self.metadata.save_path if comm.is_main_process(): if not os.path.exists(self.root_save_path): os.makedirs(self.root_save_path) - + for fname in self.metadata.class_codes: folder_path = os.path.join(self.root_save_path, fname) if not os.path.exists(folder_path): os.makedirs(folder_path) comm.synchronize() - + @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) - + return { "backbone": backbone, - "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, # Set to 32. + "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY, # Set to 32. 
"dataset_name": cfg.PROPOSAL_GENERATION.DATASET_NAME, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, @@ -88,8 +91,9 @@ def from_config(cls, cfg): "backbone_feature_key_list": cfg.PROPOSAL_GENERATION.BACKBONE_FEATURE_KEY_LIST, "num_superpixel_clusters": cfg.PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS, "feature_normalize": cfg.PROPOSAL_GENERATION.FEATURE_NORMALIZE, - "wandb_vis_period": cfg.WANDB.VIS_PERIOD_TEST, + "wandb_vis_period": cfg.WANDB.VIS_PERIOD_TEST, "debug": cfg.PROPOSAL_GENERATION.DEBUG, + "use_wandb": not cfg.WANDB.DISABLE_WANDB } @property @@ -98,20 +102,20 @@ def device(self): def prepare_mask(self, inputs, images): - # evaluation + # evaluation targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for targets_per_image in targets: gt_mask = targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask - + new_targets.append({"masks": padded_masks}) - + return new_targets - + def _prepare_features(self, features): @@ -120,7 +124,7 @@ def _prepare_features(self, features): for k, v in feat_dict.items(): feat_dict[k] = F.interpolate(v, size=(H, W), mode="bilinear", align_corners=False) - + feat_out = torch.cat([feat_dict[k] for k in self.backbone_feature_key_list], dim=1) if self.feature_normalize: feat_out = F.normalize(feat_out, dim=1, p=2) @@ -134,7 +138,7 @@ def forward(self, batched_inputs): images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) - + targets = self.prepare_mask(batched_inputs, images) features = self.backbone(images.tensor) features = self._prepare_features(features) 
@@ -148,7 +152,7 @@ def forward(self, batched_inputs): pseudo_label_list = [] for input_per_image, feature_per_image, feature_resized_per_image, image_size, targets_per_image in \ zip(batched_inputs, features, features_resized, images.image_sizes, targets): - + height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) @@ -166,19 +170,20 @@ def forward(self, batched_inputs): pseudo_label_list.append({}) instance = Instances(pseudo_label.shape[-2:]) - instance.pred_masks = pseudo_label + instance.pred_masks = pseudo_label instance.scores = pseudo_label.new_ones(pseudo_label.shape[0]) gt_instance = Instances(pseudo_label.shape[-2:]) gt_instance.pred_masks = pseudo_label - pseudo_label_list[-1]["proposals"] = instance + pseudo_label_list[-1]["proposals"] = instance pseudo_label_list[-1]["gt_masks"] = gt_instance - if comm.is_main_process() and self.num_test_iterations % self.wandb_vis_period == 0: + if comm.is_main_process() and self.use_wandb \ + and self.num_test_iterations % self.wandb_vis_period == 0: if len(pseudo_label_list) > 0: self.wandb_visualize(batched_inputs, images, pseudo_label_list) - self.num_test_iterations += 1 + self.num_test_iterations += 1 @@ -213,12 +218,12 @@ def _get_superpixels(self, feature_per_image, pred_mask): def _measure_distance(self, A, B): if self.distance_metric == "dot": - return A @ B.T + return A @ B.T elif self.distance_metric == "l2": return 2 * A @ B.T - (A * A).sum(dim=1)[:, None] - (B * B).sum(1, keepdim=True).t() - + def generate_pseudo_labels(self, input_per_image, feature_per_image, feature_resized_per_image, object_mask, object_mask_resized): # clustering is done on small resolution (1/8). 
centroids = self._get_superpixels(feature_per_image, object_mask) @@ -226,8 +231,8 @@ def generate_pseudo_labels(self, input_per_image, feature_per_image, feature_res feature_prop = feature_resized_per_image[:, object_mask_resized].transpose(0, 1).contiguous().cpu() pred_labels = self._measure_distance(feature_prop, centroids).topk(1, dim=1)[1].flatten() + 1 - mask = feature_prop.new_zeros(feature_resized_per_image.shape[-2:]).long() - mask[torch.where(object_mask_resized==True)] = pred_labels + mask = feature_prop.new_zeros(feature_resized_per_image.shape[-2:]).long() + mask[torch.where(object_mask_resized==True)] = pred_labels pred_labels_unique = pred_labels.unique() binary_mask = mask.new_zeros(len(pred_labels_unique), *feature_resized_per_image.shape[-2:]).bool() # PxHxW @@ -235,13 +240,13 @@ def generate_pseudo_labels(self, input_per_image, feature_per_image, feature_res binary_mask[i] = mask == plbl return binary_mask - - + + def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): - # NOTE: Hack to use input as visualization image. + # NOTE: Hack to use input as visualization image. 
images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["proposals"].to(self.cpu_device) for r in processed_results] @@ -249,12 +254,12 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) targets = Instances(instances.pred_masks.shape[-2:]) visualizer = Partvisualizer(image, None, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) @@ -263,3 +268,5 @@ def wandb_visualize(self, inputs, images, processed_results, opacity=0.8): image_pd = wandb.Image(vis_output.get_image()) wandb.log({"predictions": image_pd}) + + \ No newline at end of file diff --git a/part_distillation/proposal_model.py b/part_distillation/proposal_model.py index 8fae92c..3e8ebcb 100644 --- a/part_distillation/proposal_model.py +++ b/part_distillation/proposal_model.py @@ -3,12 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os -import logging + +import os +import logging import torch -import numpy as np +import numpy as np import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F @@ -25,7 +26,7 @@ from .modeling.criterion import SetCriterion from .modeling.matcher import HungarianMatcher from .utils.utils import Partvisualizer, get_iou_all_cocoapi - +from torch.cuda.amp import autocast @META_ARCH_REGISTRY.register() class ProposalModel(nn.Module): @@ -56,6 +57,7 @@ def __init__( minimum_pseudo_mask_score: float=0.0, minimum_pseudo_mask_ratio: float=0.0, apply_masking_with_object_mask: bool=True, + fp16: bool=False, ): super().__init__() self.backbone = backbone @@ -79,28 +81,30 @@ def __init__( self.wandb_vis_period_train = wandb_vis_period_train self.wandb_vis_period_test = wandb_vis_period_test self.wandb_vis_topk = wandb_vis_topk - self.num_train_iterations = 0 - self.num_test_iterations = 0 + self.num_train_iterations = 0 + self.num_test_iterations = 0 self.use_unique_per_pixel_label = use_unique_per_pixel_label self.minimum_pseudo_mask_score = minimum_pseudo_mask_score self.minimum_pseudo_mask_ratio = minimum_pseudo_mask_ratio - self.apply_masking_with_object_mask = apply_masking_with_object_mask - + self.apply_masking_with_object_mask = apply_masking_with_object_mask + + # half precision + self.fp16 = fp16 def set_postprocess_type(self, postprocess_type): if postprocess_type == "semseg": - self.use_unique_per_pixel_label = True + self.use_unique_per_pixel_label = True elif postprocess_type == "prop": - self.use_unique_per_pixel_label = False + self.use_unique_per_pixel_label = False elif postprocess_type == "prop-filtered": self.use_unique_per_pixel_label = False - self.minimum_pseudo_mask_score = 0.3 - + self.minimum_pseudo_mask_score = 0.3 + def reset_postprocess_type(self, flag, score_thres): - self.use_unique_per_pixel_label = flag + self.use_unique_per_pixel_label = flag self.minimum_pseudo_mask_score 
= score_thres - + @classmethod def from_config(cls, cfg): @@ -155,7 +159,7 @@ def from_config(cls, cfg): "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, - # wandb + # wandb "wandb_vis_period_train": cfg.WANDB.VIS_PERIOD_TRAIN, "wandb_vis_period_test": cfg.WANDB.VIS_PERIOD_TEST, "wandb_vis_topk": cfg.WANDB.VIS_TOPK, @@ -167,8 +171,9 @@ def from_config(cls, cfg): "apply_masking_with_object_mask": cfg.PROPOSAL_LEARNING.APPLY_MASKING_WITH_OBJECT_MASK, "minimum_pseudo_mask_ratio": cfg.PROPOSAL_LEARNING.MIN_AREA_RATIO, "minimum_pseudo_mask_score": cfg.PROPOSAL_LEARNING.MIN_SCORE, + "fp16": cfg.FP16, } - + @property def device(self): @@ -180,7 +185,13 @@ def forward(self, batched_inputs): images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) - features = self.backbone(images.tensor) + if self.fp16: + with autocast(): + # print("using fp16 correctly.", flush=True) + features = self.backbone(images.tensor.half()) + features = {k: v.float() for k, v in features.items()} + else: + features = self.backbone(images.tensor) targets = self.prepare_targets(batched_inputs, images) outputs = self.sem_seg_head(features) @@ -194,13 +205,13 @@ def forward(self, batched_inputs): else: # remove this loss if not specified in `weight_dict` losses.pop(k) - + if self.use_wandb and comm.is_main_process(): if self.num_train_iterations % self.wandb_vis_period_train == 0: processed_results_vis = self.inference(batched_inputs, targets, images, outputs, vis=True) self.wandb_visualize(batched_inputs, images, processed_results_vis, is_training=True) del processed_results_vis - self.num_train_iterations += 1 + self.num_train_iterations += 1 return losses else: processed_results = self.inference(batched_inputs, targets, images, outputs, vis=False) @@ -209,7 +220,7 @@ def forward(self, batched_inputs): processed_results_vis = self.inference(batched_inputs, 
targets, images, outputs, vis=True) self.wandb_visualize(batched_inputs, images, processed_results_vis, is_training=False) del processed_results_vis - self.num_test_iterations += 1 + self.num_test_iterations += 1 del batched_inputs, features, outputs, targets torch.cuda.empty_cache() @@ -234,8 +245,8 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): mask_cls_results, mask_pred_results, targets, batched_inputs, images.image_sizes )): # NOTE: Unlike standard pipeline, we provide gt label as input for inference. - # This reshapes the labels to input size already, so we want to reshape - # both gts and predictions to the original image size. + # This reshapes the labels to input size already, so we want to reshape + # both gts and predictions to the original image size. height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) @@ -244,7 +255,7 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): target_masks = retry_if_cuda_oom(sem_seg_postprocess)(target["masks"].float(), image_size, height, width).bool() target_object_masks = retry_if_cuda_oom(sem_seg_postprocess)(target["object_masks"].float(), image_size, height, width).bool() mask_cls_result = mask_cls_result.to(mask_pred_result) - + processed_results.append({}) instance_r = self.instance_inference(mask_cls_result, mask_pred_result, target_masks, target_object_masks, target["labels"], vis=vis) target_inst = Instances(target_masks.shape[-2:]) @@ -252,8 +263,8 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): target_inst.gt_classes = target["labels"] # For visualization - target_inst.pred_masks = target_masks - target_inst.pred_classes = target["labels"] + target_inst.pred_masks = target_masks + target_inst.pred_classes = target["labels"] processed_results[-1]["proposals"] = instance_r processed_results[-1]["gt_masks"] = target_inst @@ -262,14 +273,14 @@ def inference(self, batched_inputs, 
targets, images, outputs, vis=False): return processed_results + - - def _unique_assignment(self, masks_per_image, scores_per_image): + def unique_assignment(self, masks_per_image, scores_per_image): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. if self.use_unique_per_pixel_label: binmask_per_image = masks_per_image > 0 predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() - + scoremap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = scoremap_per_image.unique() newmasks_per_image = masks_per_image.new_zeros(len(query_indexs_list), *scoremap_per_image.shape[1:]) @@ -280,12 +291,13 @@ def _unique_assignment(self, masks_per_image, scores_per_image): if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] - + + # print(f'score thres: {self.minimum_pseudo_mask_score}', flush=True) loc_valid_idxs = scores_per_image > self.minimum_pseudo_mask_score if loc_valid_idxs.any(): newmasks_per_image = newmasks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] - + return newmasks_per_image.bool(), scores_per_image else: @@ -293,12 +305,12 @@ def _unique_assignment(self, masks_per_image, scores_per_image): if loc_valid_idxs.any(): masks_per_image = masks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] - + loc_valid_idxs = scores_per_image > self.minimum_pseudo_mask_score if loc_valid_idxs.any(): masks_per_image = masks_per_image[loc_valid_idxs] scores_per_image = scores_per_image[loc_valid_idxs] - + return (masks_per_image > 0), scores_per_image @@ -312,20 +324,19 @@ def prepare_targets(self, inputs, images): def _prepare_pseudo_targets(self, inputs, images): """ - This is used when training with ImageNet. + This is used when training with ImageNet. 
""" pseudo_targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for input_per_image, pseudo_targets_per_image in zip(inputs, pseudo_targets): if pseudo_targets_per_image.has("gt_masks"): gt_pseudo_masks = pseudo_targets_per_image.gt_masks.tensor - padded_pseudo_masks = torch.zeros((gt_pseudo_masks.shape[0], h_pad, w_pad), + padded_pseudo_masks = torch.zeros((gt_pseudo_masks.shape[0], h_pad, w_pad), dtype=gt_pseudo_masks.dtype, device=gt_pseudo_masks.device) padded_pseudo_masks[:, : gt_pseudo_masks.shape[1], : gt_pseudo_masks.shape[2]] = gt_pseudo_masks n = padded_pseudo_masks.shape[0] - - # During training with ImageNet, we assume each image has only one object. + # During training with ImageNet, we assume each image has only one object. object_masks = padded_pseudo_masks.sum(0, keepdim=True) new_targets.append({"labels": torch.zeros(n).long().to(self.device), # All-zeros "masks": padded_pseudo_masks, @@ -334,7 +345,7 @@ def _prepare_pseudo_targets(self, inputs, images): }) else: raise ValueError("pseudo label without masks.") - + return new_targets @@ -343,36 +354,36 @@ def _prepare_gt_targets(self, inputs, images): targets = [x["part_instances"].to(self.device) for x in inputs] object_targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for object_targets_per_image, targets_per_image in zip(object_targets, targets): gt_mask = targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask n = padded_masks.shape[0] gt_obj_mask = object_targets_per_image.gt_masks.tensor - padded_obj_masks = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), + 
padded_obj_masks = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), dtype=gt_obj_mask.dtype, device=gt_obj_mask.device) padded_obj_masks[:, : gt_obj_mask.shape[1], : gt_obj_mask.shape[2]] = gt_obj_mask - labels = targets_per_image.gt_classes.to(self.device) + labels = targets_per_image.gt_classes.to(self.device) new_targets.append({"labels": labels, "masks": padded_masks, - # "gt_object_class": object_targets_per_image.gt_classes.to(self.device), + # "gt_object_class": object_targets_per_image.gt_classes.to(self.device), "object_masks": padded_obj_masks, }) - + return new_targets def masking_with_object_mask(self, masks_per_image, target_masks): if self.apply_masking_with_object_mask: - object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() + object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() - return masks_per_image * object_target_mask + return masks_per_image * object_target_mask else: return masks_per_image @@ -390,36 +401,37 @@ def instance_inference(self, mask_cls, mask_pred, target_masks, target_object_ma mask_pred = mask_pred[topk_indices] mask_pred = self.masking_with_object_mask(mask_pred, target_object_masks) - mask_pred_bool, scores_per_image = self._unique_assignment(mask_pred, scores_per_image) + mask_pred_bool, scores_per_image = self.unique_assignment(mask_pred, scores_per_image) mask_pred_bool, scores_per_image, gt_part_labels = \ self.match_gt_labels(mask_pred_bool, scores_per_image, target_masks, target_labels) if mask_pred_bool.shape[0] == 0: - # doesn't contribute to the evaluation. + # doesn't contribute to the evaluation. 
mask_pred_bool = mask_pred.new_zeros(1, *mask_pred.shape[1:]).bool() - scores_per_image = scores_per_image.new_zeros(1) + scores_per_image = scores_per_image.new_zeros(1) gt_part_labels = gt_part_labels.new_zeros(1) - + result = Instances(image_size) + # mask (before sigmoid) result.pred_masks = mask_pred_bool pred_masks_float = result.pred_masks.float() - result.pred_classes = gt_part_labels # not used (vis only) + result.pred_classes = gt_part_labels # not used (vis only) result.scores = scores_per_image - + return result def register_metadata(self, dataset_name): self.logger.info("{} is registered for evaluation.".format(dataset_name)) self.metadata = MetadataCatalog.get(dataset_name) - + def match_gt_labels(self, masks_per_image, scores_per_image, target_masks, target_labels): pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > 0.001).flatten() @@ -435,7 +447,7 @@ def match_semseg_gt_labels(self, masks_per_image, scores_per_image, prop_feats_p pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > 0.001).flatten() @@ -451,7 +463,7 @@ def match_semseg_gt_labels(self, masks_per_image, scores_per_image, prop_feats_p def wandb_visualize(self, inputs, images, processed_results, is_training, opacity=0.8): # NOTE: Hack to use input as visualization image. 
images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["proposals"].to(self.cpu_device) for r in processed_results] @@ -459,9 +471,9 @@ def wandb_visualize(self, inputs, images, processed_results, is_training, opacit image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) - metadata = self.metadata if not is_training else None + metadata = self.metadata if not is_training else None visualizer = Partvisualizer(image, metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=instances) @@ -470,6 +482,8 @@ def wandb_visualize(self, inputs, images, processed_results, is_training, opacit visualizer = Partvisualizer(image, metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) + + diff --git a/part_distillation/supervised_model.py b/part_distillation/supervised_model.py index 1887a83..a5e9a52 100644 --- a/part_distillation/supervised_model.py +++ b/part_distillation/supervised_model.py @@ -3,12 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import os -import logging + +import os +import logging import torch -import numpy as np +import numpy as np import detectron2.utils.comm as comm -import wandb +import wandb from torch import nn from torch.nn import functional as F @@ -77,11 +78,11 @@ def __init__( self.wandb_vis_period_train = wandb_vis_period_train self.wandb_vis_period_test = wandb_vis_period_test self.wandb_vis_topk = wandb_vis_topk - self.num_train_iterations = 0 - self.num_test_iterations = 0 + self.num_train_iterations = 0 + self.num_test_iterations = 0 self.use_unique_per_pixel_label = use_unique_per_pixel_label - self.apply_masking_with_object_mask = apply_masking_with_object_mask + self.apply_masking_with_object_mask = apply_masking_with_object_mask self.class_agnostic_learning = class_agnostic_learning self.class_agnostic_inference = class_agnostic_inference @@ -140,7 +141,7 @@ def from_config(cls, cfg): "pixel_std": cfg.MODEL.PIXEL_STD, # inference "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, - # wandb + # wandb "wandb_vis_period_train": cfg.WANDB.VIS_PERIOD_TRAIN, "wandb_vis_period_test": cfg.WANDB.VIS_PERIOD_TEST, "wandb_vis_topk": cfg.WANDB.VIS_TOPK, @@ -149,10 +150,10 @@ def from_config(cls, cfg): "use_unique_per_pixel_label": cfg.SUPERVISED_MODEL.USE_PER_PIXEL_LABEL, "apply_masking_with_object_mask": cfg.SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK, "class_agnostic_learning": cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING, - "class_agnostic_inference": cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE, + "class_agnostic_inference": cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, } - + @property def device(self): @@ -178,13 +179,13 @@ def forward(self, batched_inputs): else: # remove this loss if not specified in `weight_dict` losses.pop(k) - + if self.use_wandb and comm.is_main_process(): if self.num_train_iterations % self.wandb_vis_period_train == 0: processed_results_vis = self.inference(batched_inputs, targets, images, 
outputs, vis=True) self.wandb_visualize(batched_inputs, images, processed_results_vis, is_training=True) del processed_results_vis - self.num_train_iterations += 1 + self.num_train_iterations += 1 return losses else: processed_results = self.inference(batched_inputs, targets, images, outputs, vis=False) @@ -193,7 +194,7 @@ def forward(self, batched_inputs): processed_results_vis = self.inference(batched_inputs, targets, images, outputs, vis=True) self.wandb_visualize(batched_inputs, images, processed_results_vis, is_training=False) del processed_results_vis - self.num_test_iterations += 1 + self.num_test_iterations += 1 return processed_results @@ -215,8 +216,8 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): mask_cls_results, mask_pred_results, targets, batched_inputs, images.image_sizes )): # NOTE: Unlike standard pipeline, we provide gt label as input for inference. - # This reshapes the labels to input size already, so we want to reshape - # both gts and predictions to the original image size. + # This reshapes the labels to input size already, so we want to reshape + # both gts and predictions to the original image size. 
height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) @@ -224,22 +225,22 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): target_masks = retry_if_cuda_oom(sem_seg_postprocess)(target["masks"].float(), image_size, height, width).bool() target_object_masks = retry_if_cuda_oom(sem_seg_postprocess)(target["object_masks"].float(), image_size, height, width).bool() mask_cls_result = mask_cls_result.to(mask_pred_result) - + processed_results.append({}) if self.class_agnostic_learning or self.class_agnostic_inference: instance_r = self.instance_inference(mask_cls_result, mask_pred_result, target_masks, target_object_masks, \ target["labels"], vis=vis) else: instance_r = self.instance_inference_with_classification(mask_cls_result, mask_pred_result, target_masks, \ - target_object_masks, target["labels"], vis=vis) + target_object_masks, target["labels"], vis=vis) target_inst = Instances(target_masks.shape[-2:]) target_inst.gt_masks = target_masks target_inst.gt_classes = target["labels"] # For visualization - target_inst.pred_masks = target_masks - target_inst.pred_classes = target["labels"] - + target_inst.pred_masks = target_masks + target_inst.pred_classes = target["labels"] + processed_results[-1]["predictions"] = instance_r processed_results[-1]["gt_instances"] = target_inst processed_results[-1]["proposals"] = instance_r @@ -247,16 +248,16 @@ def inference(self, batched_inputs, targets, images, outputs, vis=False): return processed_results - + def _unique_assignment(self, masks_per_image, scores_per_image): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. 
if self.use_unique_per_pixel_label: binmask_per_image = masks_per_image > 0 - + # predmask_per_image = masks_per_image.sigmoid() predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() - + scoremap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = scoremap_per_image.unique() newmasks_per_image = masks_per_image.new_zeros(len(query_indexs_list), *scoremap_per_image.shape[1:]) @@ -284,19 +285,19 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas labels_per_image = labels[topk_indices] topk_indices = torch.div(topk_indices, self.num_classes, rounding_mode='floor') - mask_pred = mask_pred[topk_indices] # KxHxW + mask_pred = mask_pred[topk_indices] # KxHxW mask_pred = self.masking_with_object_mask(mask_pred, target_object_masks) - # unique mapping and merging + # unique mapping and merging mask_pred_bool, scores_per_image, labels_per_image = self._unique_assignment_with_classes(mask_pred, scores_per_image, labels_per_image) mask_pred_bool, scores_per_image, labels_per_image, gt_part_labels = \ self.match_gt_labels_with_classification(mask_pred_bool, scores_per_image, labels_per_image, target_masks, target_labels) - + if mask_pred_bool.shape[0] == 0: - # Doesn't contribute to the evaluation. + # Doesn't contribute to the evaluation. 
mask_pred_bool = mask_pred.new_zeros(1, *mask_pred.shape[1:]).bool() - scores_per_image = scores_per_image.new_zeros(1) + scores_per_image = scores_per_image.new_zeros(1) labels_per_image = scores_per_image.new_ones(1).long() * self.num_classes gt_part_labels = scores_per_image.new_ones(1).long() * self.num_classes @@ -305,7 +306,7 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas pred_masks_float = result.pred_masks.float() result.scores = scores_per_image - result.pred_classes = labels_per_image + result.pred_classes = labels_per_image return result @@ -314,7 +315,7 @@ def instance_inference_with_classification(self, mask_cls, mask_pred, target_mas def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, class_labels): obj_map_per_image = masks_per_image.topk(1, dim=0)[0] > 0. if self.use_unique_per_pixel_label: - # segmentation + # segmentation predmask_per_image = scores_per_image[:, None, None] * masks_per_image.sigmoid() labelmap_per_image = predmask_per_image.topk(1, dim=0)[1] query_indexs_list = labelmap_per_image.unique() @@ -333,7 +334,7 @@ def _unique_assignment_with_classes(self, masks_per_image, scores_per_image, cla newscore_per_image[i] = scores_per_image[class_labels == cid].topk(1, dim=0)[0].flatten() return newmasks_per_image.bool(), newscore_per_image, query_indexs_list - else: + else: return (masks_per_image > 0), scores_per_image, class_labels @@ -341,17 +342,17 @@ def prepare_targets(self, inputs, images): targets = [x["part_instances"].to(self.device) for x in inputs] object_targets = [x["instances"].to(self.device) for x in inputs] - h_pad, w_pad = images.tensor.shape[-2:] + h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for object_targets_per_image, targets_per_image in zip(object_targets, targets): gt_mask = targets_per_image.gt_masks.tensor - padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), + padded_masks = torch.zeros((gt_mask.shape[0], h_pad, w_pad), 
dtype=gt_mask.dtype, device=gt_mask.device) padded_masks[:, : gt_mask.shape[1], : gt_mask.shape[2]] = gt_mask n = padded_masks.shape[0] gt_obj_mask = object_targets_per_image.gt_masks.tensor - padded_obj_masks = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), + padded_obj_masks = torch.zeros((gt_obj_mask.shape[0], h_pad, w_pad), dtype=gt_obj_mask.dtype, device=gt_obj_mask.device) padded_obj_masks[:, : gt_obj_mask.shape[1], : gt_obj_mask.shape[2]] = gt_obj_mask @@ -361,15 +362,15 @@ def prepare_targets(self, inputs, images): "masks": padded_masks, "object_masks": padded_obj_masks, }) - + return new_targets - + def masking_with_object_mask(self, masks_per_image, target_masks): if self.apply_masking_with_object_mask: - object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() + object_target_mask = target_masks.sum(dim=0, keepdim=True).bool() - return masks_per_image * object_target_mask + return masks_per_image * object_target_mask else: return masks_per_image @@ -382,9 +383,9 @@ def instance_inference(self, mask_cls, mask_pred, target_masks, target_object_ma # [Q, K=1] topk = self.wandb_vis_topk if vis and not self.use_unique_per_pixel_label else self.test_topk_per_image scores = mask_cls.softmax(-1)[:, :-1] - + if self.class_agnostic_learning: - scores = scores.flatten() # Binary + scores = scores.flatten() # Binary else: scores = scores.topk(1, dim=1)[0].flatten() # Use the top confidence score. (proposal eval only.) scores_per_image, topk_indices = scores.topk(topk, sorted=False) @@ -397,20 +398,20 @@ def instance_inference(self, mask_cls, mask_pred, target_masks, target_object_ma self.match_gt_labels(mask_pred_bool, scores_per_image, target_masks, target_labels) if mask_pred_bool.shape[0] == 0: - # Doesn't contribute to the evaluation. + # Doesn't contribute to the evaluation. 
mask_pred_bool = mask_pred.new_zeros(1, *mask_pred.shape[1:]).bool() - scores_per_image = scores_per_image.new_zeros(1) + scores_per_image = scores_per_image.new_zeros(1) gt_part_labels = gt_part_labels.new_zeros(1) - + result = Instances(image_size) # mask (before sigmoid) result.pred_masks = mask_pred_bool pred_masks_float = result.pred_masks.float() # calculate average mask prob - result.pred_classes = gt_part_labels + result.pred_classes = gt_part_labels result.scores = scores_per_image - + return result def register_metadata(self, dataset_name): @@ -418,13 +419,13 @@ def register_metadata(self, dataset_name): self.metadata = MetadataCatalog.get(dataset_name) if self.metadata.get("part_classes") is None and self.metadata.get("thing_classes") is not None: self.metadata.part_classes = self.metadata.thing_classes - + def match_gt_labels(self, masks_per_image, scores_per_image, target_masks, target_labels): pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > 0.001).flatten() @@ -439,7 +440,7 @@ def match_gt_labels_with_classification(self, masks_per_image, scores_per_image, pairwise_mask_ious = get_iou_all_cocoapi(masks_per_image, target_masks) top1_ious, top1_idx = pairwise_mask_ious.topk(1, dim=1) - + top1_idx = top1_idx.flatten() fg_idxs = (top1_ious > 0.001).flatten() @@ -454,7 +455,7 @@ def match_gt_labels_with_classification(self, masks_per_image, scores_per_image, def wandb_visualize(self, inputs, images, processed_results, is_training, opacity=0.8): # NOTE: Hack to use input as visualization image. 
images_raw = [x["image"].float().to(self.cpu_device) for x in inputs] - images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) + images_vis = [retry_if_cuda_oom(sem_seg_postprocess)(img, img_sz, x.get("height", img_sz[0]), x.get("width", img_sz[1])) for img, img_sz, x in zip(images_raw, images.image_sizes, inputs)] images_vis = [img.to(self.cpu_device) for img in images_vis] result_vis = [r["predictions"].to(self.cpu_device) for r in processed_results] @@ -462,9 +463,9 @@ def wandb_visualize(self, inputs, images, processed_results, is_training, opacit image, instances, targets = images_vis[0], result_vis[0], target_vis[0] image = image.permute(1, 2, 0).to(torch.uint8) white = np.ones(image.shape) * 255 - image = image * opacity + white * (1-opacity) + image = image * opacity + white * (1-opacity) - metadata = self.metadata if not is_training else None + metadata = self.metadata if not is_training else None visualizer = Partvisualizer(image, metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=instances) @@ -473,6 +474,7 @@ def wandb_visualize(self, inputs, images, processed_results, is_training, opacit visualizer = Partvisualizer(image, metadata, instance_mode=ColorMode.IMAGE) vis_output = visualizer.draw_instance_predictions(predictions=targets) - + image_gt = wandb.Image(vis_output.get_image()) wandb.log({"ground_truths": image_gt}) + diff --git a/part_distillation/utils/misc.py b/part_distillation/utils/misc.py index f418b94..874d980 100644 --- a/part_distillation/utils/misc.py +++ b/part_distillation/utils/misc.py @@ -1,8 +1,4 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - +# Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py """ Misc functions, including distributed helpers. diff --git a/part_distillation/utils/utils.py b/part_distillation/utils/utils.py index f0167b5..d859ea9 100644 --- a/part_distillation/utils/utils.py +++ b/part_distillation/utils/utils.py @@ -3,10 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + from pycocotools import mask as mask_util from detectron2.utils.visualizer import ColorMode, Visualizer, GenericMask, _create_text_labels -import numpy as np -import torch +import numpy as np +import torch import pydensecrf.densecrf as dcrf import pydensecrf.utils as dcrf_utils @@ -29,7 +30,7 @@ def proposals_to_coco_json(binary_mask): # the pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") - return [{"segmentation": rle} for rle in rles] + return [{"segmentation": rle} for rle in rles] def get_iou_all_cocoapi(pr_masks, gt_masks): @@ -54,21 +55,21 @@ def dense_crf(image, label, n_labels, p=0.7, t=5, sd1=3, sd2=5, sc=13, compat1=3 c = image.shape[2] h = image.shape[0] w = image.shape[1] - + d = dcrf.DenseCRF2D(w, h, n_labels) U = dcrf_utils.unary_from_labels(labels, n_labels, gt_prob=p, zero_unsure=False) d.setUnaryEnergy(U) - + # This adds the color-independent term, features are the locations only. feats = dcrf_utils.create_pairwise_gaussian(sdims=(sd1, sd1), shape=(h, w)) d.addPairwiseEnergy(feats, compat=compat1, kernel=dcrf.DIAG_KERNEL, normalization=dcrf.NORMALIZE_SYMMETRIC) # This adds the color-dependent term, i.e. features are (x,y,r,g,b). 
- feats = dcrf_utils.create_pairwise_bilateral(sdims=(sd2, sd2), schan=(sc, sc, sc), + feats = dcrf_utils.create_pairwise_bilateral(sdims=(sd2, sd2), schan=(sc, sc, sc), img=image, chdim=2) - d.addPairwiseEnergy(feats, compat=compat2, + d.addPairwiseEnergy(feats, compat=compat2, kernel=dcrf.DIAG_KERNEL, normalization=dcrf.NORMALIZE_SYMMETRIC) @@ -123,3 +124,5 @@ def draw_instance_predictions(self, predictions): alpha=alpha, ) return self.output + + diff --git a/part_distillation_demo.py b/part_distillation_demo.py new file mode 100644 index 0000000..58c30e0 --- /dev/null +++ b/part_distillation_demo.py @@ -0,0 +1,276 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# From https://github.com/facebookresearch/Detic/blob/main/demo.py . +# Modified by Jang Hyun Cho. + +import argparse +import glob +import multiprocessing as mp +import numpy as np +import os +import tempfile +import time +import warnings +import cv2 +import tqdm +import sys +import mss + +from detectron2.config import get_cfg +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger +from detectron2.projects.deeplab import add_deeplab_config + + +sys.path.append('Detic/third_party/CenterNet2') +from centernet.config import add_centernet_config +from Detic.detic.config import add_detic_config + +from part_distillation import (add_maskformer2_config, + add_wandb_config, + add_proposal_learning_config, + add_custom_datasets_config) + +from part_distillation.demo.part_distillation_predictor import PartVisualizationDemo + +# Fake a video capture object OpenCV style - half width, half height of first screen using MSS +class ScreenGrab: + def __init__(self): + self.sct = mss.mss() + m0 = self.sct.monitors[0] + self.monitor = {'top': 0, 'left': 0, 'width': m0['width'] / 2, 'height': m0['height'] / 2} + + def read(self): + img 
= np.array(self.sct.grab(self.monitor)) + nf = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR) + return (True, nf) + + def isOpened(self): + return True + def release(self): + return True + + +# constants +WINDOW_NAME = "PartDistillation-Demo" + +def setup_object_cfg(args): + cfg = get_cfg() + if args.cpu: + cfg.MODEL.DEVICE="cpu" + add_centernet_config(cfg) + add_detic_config(cfg) + cfg.merge_from_file(args.object_config_file) + cfg.merge_from_list(args.opts) + # Set score_threshold for builtin models + cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold + cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'rand' # load later + cfg.INPUT.MIN_SIZE_TEST = args.min_image_size + if not args.pred_all_class: + cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True + cfg.freeze() + return cfg + + +def setup_part_cfg(args): + cfg = get_cfg() + if args.cpu: + cfg.MODEL.DEVICE="cpu" + add_deeplab_config(cfg) + add_maskformer2_config(cfg) + add_proposal_learning_config(cfg) + add_custom_datasets_config(cfg) + add_wandb_config(cfg) + cfg.merge_from_file(args.part_config_file) + cfg.merge_from_list(args.opts) + cfg.MODEL.WEIGHTS = args.weight_path + cfg.PROPOSAL_LEARNING.MIN_SCORE = args.part_score_threshold + cfg.PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL = args.non_overlapping + cfg.INPUT.MIN_SIZE_TEST = args.min_image_size + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") + parser.add_argument( + "--object-config-file", + default="configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--part-config-file", + default="configs/PartProposalLearning.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--weight-path", 
default="weights/PartProposalLearning/IN1K+Human/part_proposal_model.pth", type=str) + parser.add_argument("--min-image-size", default=640, type=int) + parser.add_argument("--part-score-threshold", default=0.3, type=float) + parser.add_argument("--non-overlapping", action='store_true', help="Non-overlapping segments.") + parser.add_argument("--dcrf", action='store_true', help="Apply dense-CRF.") + parser.add_argument("--webcam", help="Take inputs from webcam.") + parser.add_argument("--cpu", action='store_true', help="Use CPU only.") + parser.add_argument("--video-input", help="Path to video file.") + parser.add_argument( + "--input", + nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + parser.add_argument( + "--vocabulary", + default="lvis", + choices=['lvis', 'openimages', 'objects365', 'coco', 'custom'], + help="", + ) + parser.add_argument( + "--custom_vocabulary", + default="", + help="", + ) + parser.add_argument("--pred_all_class", action='store_true') + parser.add_argument( + "--confidence-threshold", + type=float, + default=0.5, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + + return parser + + +def test_opencv_video_format(codec, file_ext): + with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: + filename = os.path.join(dir, "test_file" + file_ext) + writer = cv2.VideoWriter( + filename=filename, + fourcc=cv2.VideoWriter_fourcc(*codec), + fps=float(30), + frameSize=(10, 10), + isColor=True, + ) + [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] + writer.release() + if os.path.isfile(filename): + return True + return False + 
+ +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + object_cfg = setup_object_cfg(args) + part_cfg = setup_part_cfg(args) + + demo = PartVisualizationDemo(object_cfg, part_cfg, args) + + if args.input: + if len(args.input) == 1: + args.input = glob.glob(os.path.expanduser(args.input[0])) + assert args.input, "The input path(s) was not found" + for path in tqdm.tqdm(args.input, disable=not args.output): + img = read_image(path, format="BGR") + start_time = time.time() + predictions, visualized_output = demo.run_on_image(img) + logger.info( + "{}: {} in {:.2f}s".format( + path, + "detected {} instances".format(len(predictions["instances"])) + if "instances" in predictions + else "finished", + time.time() - start_time, + ) + ) + + if args.output: + if os.path.isdir(args.output): + assert os.path.isdir(args.output), args.output + out_filename = os.path.join(args.output, os.path.basename(path)) + else: + assert len(args.input) == 1, "Please specify a directory with args.output" + out_filename = args.output + visualized_output.save(out_filename) + else: + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) + if cv2.waitKey(0) == 27: + break # esc to quit + elif args.webcam: + assert args.input is None, "Cannot have both --input and --webcam!" + assert args.output is None, "output not yet supported with --webcam!" 
+ if args.webcam == "screen": + cam = ScreenGrab() + else: + cam = cv2.VideoCapture(int(args.webcam)) + for vis in tqdm.tqdm(demo.run_on_video(cam)): + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, vis) + if cv2.waitKey(1) == 27: + break # esc to quit + cam.release() + cv2.destroyAllWindows() + elif args.video_input: + video = cv2.VideoCapture(args.video_input) + width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames_per_second = video.get(cv2.CAP_PROP_FPS) + num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) + basename = os.path.basename(args.video_input) + codec, file_ext = ( + ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") + ) + if codec == ".mp4v": + warnings.warn("x264 codec not available, switching to mp4v") + if args.output: + if os.path.isdir(args.output): + output_fname = os.path.join(args.output, basename) + output_fname = os.path.splitext(output_fname)[0] + file_ext + else: + output_fname = args.output + assert not os.path.isfile(output_fname), output_fname + output_file = cv2.VideoWriter( + filename=output_fname, + # some installation of opencv may not support x264 (due to its license), + # you can try other format (e.g. 
MPEG) + fourcc=cv2.VideoWriter_fourcc(*codec), + fps=float(frames_per_second), + frameSize=(width, height), + isColor=True, + ) + assert os.path.isfile(args.video_input) + for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): + if args.output: + output_file.write(vis_frame) + else: + cv2.namedWindow(basename, cv2.WINDOW_NORMAL) + cv2.imshow(basename, vis_frame) + if cv2.waitKey(1) == 27: + break # esc to quit + video.release() + if args.output: + output_file.release() + else: + cv2.destroyAllWindows() \ No newline at end of file diff --git a/part_distillation_train_net.py b/part_distillation_train_net.py index 8ecdd97..18f456a 100644 --- a/part_distillation_train_net.py +++ b/part_distillation_train_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -12,14 +13,14 @@ except: pass -import sys -import os +import sys +import os import torch -import torch.nn as nn -import numpy as np +import torch.nn as nn +import numpy as np import logging import detectron2.utils.comm as comm -import wandb +import wandb sys.path.append('Detic/third_party/CenterNet2') sys.path.append('Detic/third_party/Deformable-DETR') @@ -29,8 +30,8 @@ from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, - build_detection_test_loader, +from detectron2.data import (MetadataCatalog, + build_detection_test_loader, build_detection_train_loader) from detectron2.engine import (default_argument_parser, @@ -42,8 +43,8 @@ from detectron2.utils.comm import is_main_process, synchronize from detectron2.evaluation import verify_results, inference_on_dataset, print_csv_format -from part_distillation import (add_maskformer2_config, - add_wandb_config, +from part_distillation import (add_maskformer2_config, + add_wandb_config, 
add_custom_datasets_config, add_part_distillation_config) @@ -62,7 +63,7 @@ from part_distillation.evaluation.miou_evaluator import mIOU_Evaluator from part_distillation.evaluation.miou_matcher import mIOU_Matcher - + from base_trainer import BaseTrainer, maybe_dp, get_mode @@ -71,17 +72,17 @@ class Trainer(BaseTrainer): @classmethod def build_evaluator(cls, cfg, dataset_name): if "match" in dataset_name: - return mIOU_Matcher(dataset_name, + return mIOU_Matcher(dataset_name, num_classes=cfg.PART_DISTILLATION.NUM_PART_CLASSES) elif "evaluate" in dataset_name: return mIOU_Evaluator(dataset_name) - + @classmethod def build_test_loader(cls, cfg, dataset_name): if "pascal" in dataset_name: mapper = VOCPartsMapper(cfg, is_train=False) elif "part_imagenet" in dataset_name: - mapper = PartImageNetMapper(cfg, is_train=False) + mapper = PartImageNetMapper(cfg, dataset_name, is_train=False) elif "cityscapes" in dataset_name: mapper = CityscapesPartMapper(cfg, is_train=False) elif "save_labels" in dataset_name: @@ -105,18 +106,19 @@ def test(cls, cfg, model): mode = get_mode(dataset_name) logger.info("Starting {} mode on {}.".format(mode, dataset_name)) maybe_dp(model).register_metadata(dataset_name) - maybe_dp(model).mode = mode + maybe_dp(model).mode = mode data_loader = cls.build_test_loader(cfg, dataset_name) evaluator = cls.build_evaluator(cfg, dataset_name) results_i = inference_on_dataset(model, data_loader, evaluator) - + if mode == "match": maybe_dp(model).update_majority_vote_mapping(results_i) logger.info("Majority vote result:\n{}".format(results_i)) - continue - maybe_dp(model).mode = "" # reset mode. - + continue + maybe_dp(model).register_metadata(cfg.DATASETS.TRAIN[0]) # reset to training dataset. + maybe_dp(model).mode = "" # reset mode. 
+ results.update(results_i) if comm.is_main_process(): assert isinstance(results_i, dict), \ @@ -125,15 +127,18 @@ def test(cls, cfg, model): print_csv_format(results_i) comm.synchronize() + # add dataset name + results = {dataset_name + "_" + k: v for k, v in results.items()} + if len(results) == 1: results = list(results.values())[0] - + comm.synchronize() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.log(results) return results - + def setup(args): cfg = get_cfg() @@ -150,7 +155,7 @@ def setup(args): # Setup logger for "mask_former" module setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="part_distillation") - + # for part-imagenet mapping. register_imagenet("imagenet_1k_meta_train", "train", partitioned_imagenet=False) @@ -168,7 +173,7 @@ def setup(args): for dataset_name in cfg.DATASETS.TEST: if "part_imagenet" in dataset_name: - register_part_imagenet(name=dataset_name, + register_part_imagenet(name=dataset_name, images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], @@ -184,7 +189,7 @@ def setup(args): path_only=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY, debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, ) - + elif "pascal" in dataset_name: register_pascal_parts( name=dataset_name, @@ -201,7 +206,7 @@ def setup(args): cfg.PART_DISTILLATION.DATASET_PATH, "train", partitioned_imagenet=bool(cfg.PART_DISTILLATION.TOTAL_PARTITIONS > 0), - total_partitions=cfg.PART_DISTILLATION.TOTAL_PARTITIONS, + total_partitions=cfg.PART_DISTILLATION.TOTAL_PARTITIONS, partition_index=cfg.PART_DISTILLATION.PARTITION_INDEX, dataset_path_list=cfg.PART_DISTILLATION.DATASET_PATH_LIST, filtered_code_path_list=cfg.PART_DISTILLATION.FILTERED_CODE_PATH_LIST, @@ -218,7 +223,7 @@ def setup(args): def main(args): cfg = setup(args) if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - run_name = cfg.WANDB.RUN_NAME + 
run_name = cfg.WANDB.RUN_NAME wandb.init(project=cfg.WANDB.PROJECT, sync_tensorboard=True, name=run_name, group=cfg.WANDB.GROUP, config=cfg.PART_DISTILLATION, dir=cfg.OUTPUT_DIR) @@ -233,13 +238,13 @@ def main(args): if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.finish() return res - + trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) res = trainer.train() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - wandb.finish() - return res + wandb.finish() + return res if __name__ == "__main__": diff --git a/part_proposal_train_net.py b/part_proposal_train_net.py index 2759595..9b0f6b0 100644 --- a/part_proposal_train_net.py +++ b/part_proposal_train_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -12,14 +13,14 @@ except: pass -import sys -import os +import sys +import os import torch -import torch.nn as nn -import numpy as np +import torch.nn as nn +import numpy as np import logging import detectron2.utils.comm as comm -import wandb +import wandb sys.path.append('Detic/third_party/CenterNet2') sys.path.append('Detic/third_party/Deformable-DETR') @@ -29,10 +30,10 @@ from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, - build_detection_test_loader, +from detectron2.data import (MetadataCatalog, + DatasetCatalog, + build_detection_test_loader, build_detection_train_loader) - from detectron2.engine import (default_argument_parser, default_setup, launch) @@ -42,9 +43,9 @@ from detectron2.utils.comm import is_main_process, synchronize from detectron2.evaluation import verify_results, inference_on_dataset, print_csv_format -from part_distillation import (add_maskformer2_config, - add_wandb_config, - add_proposal_learning_config, +from part_distillation import 
(add_maskformer2_config, + add_wandb_config, + add_proposal_learning_config, add_custom_datasets_config) from part_distillation.data.datasets.register_pascal_parts import register_pascal_parts @@ -71,7 +72,7 @@ def build_evaluator(self, *args, **kwargs): @classmethod def build_train_loader(self, cfg): mapper = ProposalDatasetMapper(cfg, base_size=cfg.CUSTOM_DATASETS.BASE_SIZE) - + return build_detection_train_loader(cfg, mapper=mapper) @@ -80,7 +81,7 @@ def build_test_loader(self, cfg, dataset_name): if "pascal" in dataset_name: mapper = VOCPartsMapper(cfg, is_train=False) elif "part_imagenet" in dataset_name: - mapper = PartImageNetMapper(cfg, is_train=False) + mapper = PartImageNetMapper(cfg, dataset_name, is_train=False) elif "cityscapes" in dataset_name: mapper = CityscapesPartMapper(cfg, is_train=False) @@ -105,7 +106,7 @@ def test(cls, cfg, model): logger.info("Changing result key names for postprocess: {}.".format(postprocess_type)) results_i = {postprocess_type + "_" + k:v for k, v in results_i.items()} - maybe_dp(model).reset_postprocess_type(cfg.PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL, + maybe_dp(model).reset_postprocess_type(cfg.PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL, cfg.PROPOSAL_LEARNING.MIN_SCORE) results.update(results_i) @@ -115,7 +116,7 @@ def test(cls, cfg, model): logger.info("Evaluation results for {} in csv format:".format(dataset_name)) print_csv_format(results_i) comm.synchronize() - + if len(results) == 1: results = list(results.values())[0] @@ -124,7 +125,7 @@ def test(cls, cfg, model): wandb.log(results) return results - + def setup(args): """ @@ -142,9 +143,9 @@ def setup(args): cfg.freeze() default_setup(cfg, args) - # Setup logger + # Setup logger setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="part_distillation") - + # register dataset register_imagenet_with_proposals(cfg.DATASETS.TRAIN[0], cfg.PROPOSAL_LEARNING.DATASET_PATH, @@ -156,37 +157,38 @@ def setup(args): path_only=cfg.PROPOSAL_LEARNING.PATH_ONLY, 
debug=cfg.PROPOSAL_LEARNING.DEBUG, ) - + for dataset_name in cfg.DATASETS.TEST: - if "part_imagenet" in dataset_name: - register_part_imagenet(name=dataset_name, - images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, - annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, - split=dataset_name.split('_')[-1], - debug=cfg.CUSTOM_DATASETS.PART_IMAGENET.DEBUG, - ) - - elif "cityscapes" in dataset_name: - register_cityscapes_part(name=dataset_name, - images_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.IMAGES_DIRNAME, - annotations_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.ANNOTATIONS_DIRNAME, - split=dataset_name.split('_')[-1], - path_only=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY, - debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, - ) - - elif "pascal" in dataset_name: - register_pascal_parts( - name=dataset_name, - images_dirname=cfg.CUSTOM_DATASETS.PASCAL_PARTS.IMAGES_DIRNAME, - annotations_dirname=cfg.CUSTOM_DATASETS.PASCAL_PARTS.ANNOTATIONS_DIRNAME, - split=dataset_name.split('_')[-1], - year=2012, # Fixed. 
- subset_class_names=cfg.CUSTOM_DATASETS.PASCAL_PARTS.SUBSET_CLASS_NAMES, - debug=cfg.CUSTOM_DATASETS.PASCAL_PARTS.DEBUG, + if dataset_name not in DatasetCatalog.list(): + if "part_imagenet" in dataset_name: + register_part_imagenet(name=dataset_name, + images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, + annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, + split=dataset_name.split('_')[-1], + debug=cfg.CUSTOM_DATASETS.PART_IMAGENET.DEBUG, ) - else: - raise ValueError("{} not supported.".format(dataset_name)) + + elif "cityscapes" in dataset_name: + register_cityscapes_part(name=dataset_name, + images_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.IMAGES_DIRNAME, + annotations_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.ANNOTATIONS_DIRNAME, + split=dataset_name.split('_')[-1], + path_only=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY, + debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, + ) + + elif "pascal" in dataset_name: + register_pascal_parts( + name=dataset_name, + images_dirname=cfg.CUSTOM_DATASETS.PASCAL_PARTS.IMAGES_DIRNAME, + annotations_dirname=cfg.CUSTOM_DATASETS.PASCAL_PARTS.ANNOTATIONS_DIRNAME, + split=dataset_name.split('_')[-1], + year=2012, # Fixed. 
+ subset_class_names=cfg.CUSTOM_DATASETS.PASCAL_PARTS.SUBSET_CLASS_NAMES, + debug=cfg.CUSTOM_DATASETS.PASCAL_PARTS.DEBUG, + ) + else: + raise ValueError("{} not supported.".format(dataset_name)) return cfg @@ -194,7 +196,7 @@ def setup(args): def main(args): cfg = setup(args) if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - run_name = cfg.WANDB.RUN_NAME + run_name = cfg.WANDB.RUN_NAME if not os.path.exists(cfg.VIS_OUTPUT_DIR): os.makedirs(cfg.VIS_OUTPUT_DIR) wandb.init(project=cfg.WANDB.PROJECT, sync_tensorboard=True, name=run_name, @@ -217,7 +219,7 @@ def main(args): res = trainer.train() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.finish() - return res + return res if __name__ == "__main__": diff --git a/part_ranking_train_net.py b/part_ranking_train_net.py index 0a2521e..ee1cec4 100644 --- a/part_ranking_train_net.py +++ b/part_ranking_train_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -15,11 +16,11 @@ import copy import logging import os -import sys -import wandb +import sys +import wandb import torch -import torch.nn as nn -import numpy as np +import torch.nn as nn +import numpy as np import detectron2.utils.comm as comm sys.path.append('Detic/third_party/CenterNet2') @@ -30,8 +31,8 @@ from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, - build_detection_test_loader, +from detectron2.data import (MetadataCatalog, + build_detection_test_loader, build_detection_train_loader) from detectron2.engine import (DefaultTrainer, @@ -44,9 +45,9 @@ from detectron2.utils.comm import is_main_process, synchronize from detectron2.evaluation import verify_results, inference_on_dataset, print_csv_format -from part_distillation import (add_maskformer2_config, - add_wandb_config, - add_part_ranking_config, +from part_distillation import (add_maskformer2_config, + add_wandb_config, + add_part_ranking_config, add_custom_datasets_config) from part_distillation.data.dataset_mappers.proposal_dataset_mapper import ProposalDatasetMapper @@ -73,12 +74,12 @@ class Trainer(DefaultTrainer): @classmethod def build_evaluator(self, cfg, dataset_name): - if "pre_labeling" in dataset_name: + if "pre_labeling" in dataset_name: return ClusteringModule(num_clusters=cfg.PART_RANKING.NUM_CLUSTERS) elif "post_labeling" in dataset_name: return NullEvaluator() elif "match" in dataset_name: - return mIOU_Matcher(dataset_name, + return mIOU_Matcher(dataset_name, num_classes=cfg.PART_RANKING.NUM_CLUSTERS) elif "evaluate" in dataset_name: return mIOU_Evaluator(dataset_name) @@ -88,10 +89,10 @@ def build_test_loader(self, cfg, dataset_name): if "pascal" in dataset_name: mapper = VOCPartsMapper(cfg, is_train=False) elif "part_imagenet" in dataset_name: - mapper = PartImageNetMapper(cfg, is_train=False) + mapper = 
PartImageNetMapper(cfg, dataset_name, is_train=False) elif "cityscapes" in dataset_name: mapper = CityscapesPartMapper(cfg, is_train=False) - elif "imagenet" in dataset_name: + elif "imagenet" in dataset_name: class_code_to_class_index = MetadataCatalog.get(dataset_name).class_code_to_class_id mapper = ImagenetPartRankingDatasetMapper(cfg, class_code_to_class_index) @@ -103,7 +104,7 @@ def test(cls, cfg, model): logger = logging.getLogger("part_distillation") results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): - + # set mode mode = get_mode(dataset_name) maybe_dp(model).mode = mode @@ -113,16 +114,16 @@ def test(cls, cfg, model): data_loader = cls.build_test_loader(cfg, dataset_name) evaluator = cls.build_evaluator(cfg, dataset_name) results_i = inference_on_dataset(model, data_loader, evaluator) - + if mode == "cluster": - maybe_dp(model).register_classifier(results_i) + maybe_dp(model).register_classifier(results_i) logger.info("Cluster centroids are registered as classifiers ({} classes).".format(len(results_i))) - continue + continue elif mode == "match": maybe_dp(model).update_majority_vote_mapping(results_i) logger.info("Majority vote result:\n{}".format(results_i)) - continue - + continue + results.update(results_i) if comm.is_main_process(): assert isinstance(results_i, dict), \ @@ -133,13 +134,13 @@ def test(cls, cfg, model): if len(results) == 1: results = list(results.values())[0] - + comm.synchronize() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.log(results) return results - + def setup(args): """ @@ -163,7 +164,7 @@ def setup(args): # register dataset for dataset_name in cfg.DATASETS.TEST: if "part_imagenet" in dataset_name: - register_part_imagenet(name=dataset_name, + register_part_imagenet(name=dataset_name, images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], @@ -175,10 +176,11 @@ 
def setup(args): images_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], + for_segmentation=True, path_only=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY, debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, ) - + elif "pascal" in dataset_name: register_pascal_parts( name=dataset_name, @@ -186,6 +188,7 @@ def setup(args): annotations_dirname=cfg.CUSTOM_DATASETS.PASCAL_PARTS.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], year=2012, # Fixed. + for_segmentation=True, subset_class_names=cfg.CUSTOM_DATASETS.PASCAL_PARTS.SUBSET_CLASS_NAMES, debug=cfg.CUSTOM_DATASETS.PASCAL_PARTS.DEBUG, ) @@ -196,7 +199,7 @@ def setup(args): "train", cfg.PART_RANKING.MIN_OBJECT_AREA_RATIO, partitioned_imagenet=bool(cfg.PART_RANKING.TOTAL_PARTITIONS > 0), - total_partitions=cfg.PART_RANKING.TOTAL_PARTITIONS, + total_partitions=cfg.PART_RANKING.TOTAL_PARTITIONS, partition_index=cfg.PART_RANKING.PARTITION_INDEX, dataset_path_list=cfg.PART_RANKING.DATASET_PATH_LIST, filtered_code_path_list=cfg.PART_RANKING.FILTERED_CODE_PATH_LIST, @@ -211,7 +214,7 @@ def setup(args): def main(args): cfg = setup(args) if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - run_name = cfg.WANDB.RUN_NAME + run_name = cfg.WANDB.RUN_NAME wandb.init(project=cfg.WANDB.PROJECT, sync_tensorboard=True, name=run_name, group=cfg.WANDB.GROUP, config=cfg.PART_RANKING, dir=cfg.OUTPUT_DIR) @@ -227,7 +230,7 @@ def main(args): wandb.finish() return res - + if __name__ == "__main__": args = default_argument_parser().parse_args() diff --git a/part_segment_demo.py b/part_segment_demo.py new file mode 100644 index 0000000..a8d8224 --- /dev/null +++ b/part_segment_demo.py @@ -0,0 +1,278 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# From https://github.com/facebookresearch/Detic/blob/main/demo.py . +# Modified by Jang Hyun Cho. + +import argparse +import glob +import multiprocessing as mp +import numpy as np +import os +import tempfile +import time +import warnings +import cv2 +import tqdm +import sys +import mss + +from detectron2.config import get_cfg +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger +from detectron2.projects.deeplab import add_deeplab_config + +sys.path.append('Detic/third_party/CenterNet2') +from centernet.config import add_centernet_config +from Detic.detic.config import add_detic_config + +from part_distillation import (add_maskformer2_config, + add_wandb_config, + add_pixel_grouping_confing, + add_custom_datasets_config) + +from part_distillation.demo.part_segment_predictor import PartVisualizationDemo + +WEIGHT_DICT = {"coco_instance_seg": "weights/mask2former/instance/swinL_i21k_q200_e100.pkl", + "coco_panoptic_seg": "weights/mask2former/panoptic/swinL_i21k_q200_e100.pkl", + "cityscapes_instance_seg": "weights/mask2former/cityscapes/instance/swinL_i21k_q200_e100.pkl", + "cityscapes_panoptic_seg": "weights/mask2former/cityscapes/panoptic/swinL_i21k_q200_e100.pkl", + "imagenet": "weights/backbone/swin_large_patch4_window12_384_22k.pkl", + } + +# Fake a video capture object OpenCV style - half width, half height of first screen using MSS +class ScreenGrab: + def __init__(self): + self.sct = mss.mss() + m0 = self.sct.monitors[0] + self.monitor = {'top': 0, 'left': 0, 'width': m0['width'] / 2, 'height': m0['height'] / 2} + + def read(self): + img = np.array(self.sct.grab(self.monitor)) + nf = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR) + return (True, nf) + + def isOpened(self): + return True + def release(self): + return True + + +# constants +WINDOW_NAME = "PartSegment-Demo" + +def setup_object_cfg(args): + cfg = get_cfg() + if args.cpu: + cfg.MODEL.DEVICE="cpu" + add_centernet_config(cfg) + add_detic_config(cfg) + 
cfg.merge_from_file(args.object_config_file) + cfg.merge_from_list(args.opts) + # Set score_threshold for builtin models + cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold + cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'rand' # load later + cfg.INPUT.MIN_SIZE_TEST = args.min_image_size + if not args.pred_all_class: + cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True + cfg.freeze() + return cfg + + +def setup_part_cfg(args): + cfg = get_cfg() + if args.cpu: + cfg.MODEL.DEVICE="cpu" + add_deeplab_config(cfg) + add_maskformer2_config(cfg) + add_pixel_grouping_confing(cfg) + add_custom_datasets_config(cfg) + add_wandb_config(cfg) + cfg.merge_from_file(args.part_config_file) + cfg.merge_from_list(args.opts) + cfg.INPUT.MIN_SIZE_TEST = args.min_image_size + cfg.PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS = args.k + cfg.MODEL.WEIGHTS = WEIGHT_DICT[args.weight_name] + cfg.freeze() + return cfg + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") + parser.add_argument( + "--object-config-file", + default="configs/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--part-config-file", + default="configs/PixelGrouping.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--min-image-size", default=640, type=int) + parser.add_argument("--k", default=4, type=int) + parser.add_argument("--dcrf", action='store_true', help="Apply Dense-CRF.") + parser.add_argument("--weight-name", default="instance_seg", type=str) + parser.add_argument("--webcam", help="Take inputs from webcam.") + parser.add_argument("--cpu", action='store_true', help="Use CPU only.") + parser.add_argument("--video-input", help="Path to video file.") + parser.add_argument( + 
"--input", + nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + parser.add_argument( + "--vocabulary", + default="lvis", + choices=['lvis', 'openimages', 'objects365', 'coco', 'custom'], + help="", + ) + parser.add_argument( + "--custom_vocabulary", + default="", + help="", + ) + parser.add_argument("--pred_all_class", action='store_true') + parser.add_argument( + "--confidence-threshold", + type=float, + default=0.5, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + + return parser + + +def test_opencv_video_format(codec, file_ext): + with tempfile.TemporaryDirectory(prefix="video_format_test") as dir: + filename = os.path.join(dir, "test_file" + file_ext) + writer = cv2.VideoWriter( + filename=filename, + fourcc=cv2.VideoWriter_fourcc(*codec), + fps=float(30), + frameSize=(10, 10), + isColor=True, + ) + [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)] + writer.release() + if os.path.isfile(filename): + return True + return False + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + object_cfg = setup_object_cfg(args) + part_cfg = setup_part_cfg(args) + + demo = PartVisualizationDemo(object_cfg, part_cfg, args) + + if args.input: + if len(args.input) == 1: + args.input = glob.glob(os.path.expanduser(args.input[0])) + assert args.input, "The input path(s) was not found" + for path in tqdm.tqdm(args.input, disable=not args.output): + img = read_image(path, format="BGR") + start_time = time.time() + 
predictions, visualized_output = demo.run_on_image(img) + logger.info( + "{}: {} in {:.2f}s".format( + path, + "detected {} instances".format(len(predictions["instances"])) + if "instances" in predictions + else "finished", + time.time() - start_time, + ) + ) + + if args.output: + if os.path.isdir(args.output): + assert os.path.isdir(args.output), args.output + out_filename = os.path.join(args.output, os.path.basename(path)) + else: + assert len(args.input) == 1, "Please specify a directory with args.output" + out_filename = args.output + visualized_output.save(out_filename) + else: + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) + if cv2.waitKey(0) == 27: + break # esc to quit + elif args.webcam: + assert args.input is None, "Cannot have both --input and --webcam!" + assert args.output is None, "output not yet supported with --webcam!" + if args.webcam == "screen": + cam = ScreenGrab() + else: + cam = cv2.VideoCapture(int(args.webcam)) + for vis in tqdm.tqdm(demo.run_on_video(cam)): + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, vis) + if cv2.waitKey(1) == 27: + break # esc to quit + cam.release() + cv2.destroyAllWindows() + elif args.video_input: + video = cv2.VideoCapture(args.video_input) + width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames_per_second = video.get(cv2.CAP_PROP_FPS) + num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) + basename = os.path.basename(args.video_input) + codec, file_ext = ( + ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") + ) + if codec == ".mp4v": + warnings.warn("x264 codec not available, switching to mp4v") + if args.output: + if os.path.isdir(args.output): + output_fname = os.path.join(args.output, basename) + output_fname = os.path.splitext(output_fname)[0] + file_ext + else: + output_fname = args.output + assert not 
os.path.isfile(output_fname), output_fname + output_file = cv2.VideoWriter( + filename=output_fname, + # some installation of opencv may not support x264 (due to its license), + # you can try other format (e.g. MPEG) + fourcc=cv2.VideoWriter_fourcc(*codec), + fps=float(frames_per_second), + frameSize=(width, height), + isColor=True, + ) + assert os.path.isfile(args.video_input) + for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): + if args.output: + output_file.write(vis_frame) + else: + cv2.namedWindow(basename, cv2.WINDOW_NORMAL) + cv2.imshow(basename, vis_frame) + if cv2.waitKey(1) == 27: + break # esc to quit + video.release() + if args.output: + output_file.release() + else: + cv2.destroyAllWindows() \ No newline at end of file diff --git a/pixel_grouping_test_net.py b/pixel_grouping_test_net.py index 4e87e78..8c2544a 100644 --- a/pixel_grouping_test_net.py +++ b/pixel_grouping_test_net.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ + import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -11,14 +13,17 @@ warnings.filterwarnings('ignore', category=ShapelyDeprecationWarning) except: pass - + +import urllib3 +warnings.simplefilter('ignore', urllib3.exceptions.SubjectAltNameWarning) +warnings.filterwarnings('ignore', category=FutureWarning) import copy import logging import os -import sys +import sys import torch import detectron2.utils.comm as comm -import wandb +import wandb sys.path.append('Detic/third_party/CenterNet2') sys.path.append('Detic/third_party/Deformable-DETR') @@ -26,7 +31,7 @@ from collections import OrderedDict from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, +from detectron2.data import (MetadataCatalog, build_detection_test_loader) from detectron2.engine import (DefaultTrainer, @@ -39,9 +44,9 @@ from detectron2.utils.comm import is_main_process, synchronize from detectron2.evaluation import verify_results, inference_on_dataset, print_csv_format -from part_distillation import (add_maskformer2_config, - add_wandb_config, - add_pixel_grouping_confing, +from part_distillation import (add_maskformer2_config, + add_wandb_config, + add_pixel_grouping_confing, add_custom_datasets_config) from part_distillation.data.datasets.register_imagenet import register_imagenet @@ -69,7 +74,7 @@ def test(cls, cfg, model): data_loader = cls.build_test_loader(cfg, dataset_name) evaluator = cls.build_evaluator(cfg, dataset_name) results_i = inference_on_dataset(model, data_loader, evaluator) - + results.update(results_i) if comm.is_main_process(): assert isinstance(results_i, dict), \ @@ -80,13 +85,13 @@ def test(cls, cfg, model): if len(results) == 1: results = list(results.values())[0] - + comm.synchronize() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.log(results) return results - + def setup(args): """ @@ -106,12 +111,12 @@ def setup(args): # Setup logger 
setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="part_distillation") - # To use the metadata + # To use the metadata register_imagenet("imagenet_1k_meta_train", "train", partitioned_imagenet=False) for dataset_name in cfg.DATASETS.TEST: if "part_imagenet" in dataset_name: - register_part_imagenet(name=dataset_name, + register_part_imagenet(name=dataset_name, images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], @@ -119,14 +124,14 @@ def setup(args): ) else: raise ValueError("{} not supported for pixel grouping evaluation.".format(dataset_name)) - + return cfg def main(args): cfg = setup(args) if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - run_name = cfg.WANDB.RUN_NAME + run_name = cfg.WANDB.RUN_NAME wandb.init(project=cfg.WANDB.PROJECT, sync_tensorboard=True, name=run_name, group=cfg.WANDB.GROUP, config=cfg.PIXEL_GROUPING, dir=cfg.VIS_OUTPUT_DIR) @@ -142,7 +147,7 @@ def main(args): wandb.finish() return res - + if __name__ == "__main__": diff --git a/proposal_generation_net.py b/proposal_generation_net.py index 57e15e2..d07f54a 100644 --- a/proposal_generation_net.py +++ b/proposal_generation_net.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import warnings warnings.filterwarnings('ignore', category=UserWarning) try: @@ -12,14 +13,17 @@ except: pass +import urllib3 +warnings.simplefilter('ignore', urllib3.exceptions.SubjectAltNameWarning) +warnings.filterwarnings('ignore', category=FutureWarning) import copy import itertools import logging import os -import sys +import sys import torch import detectron2.utils.comm as comm -import wandb +import wandb sys.path.append('Detic/third_party/CenterNet2') sys.path.append('Detic/third_party/Deformable-DETR') @@ -31,9 +35,9 @@ from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, +from detectron2.data import (MetadataCatalog, build_detection_test_loader) - + from detectron2.engine import (DefaultTrainer, default_argument_parser, default_setup, @@ -47,8 +51,7 @@ from part_distillation.data.dataset_mappers.proposal_generation_mapper import ProposalGenerationMapper from part_distillation.evaluation.null_evaluator import NullEvaluator from part_distillation.data.datasets.register_imagenet import register_imagenet - - + class Trainer(DefaultTrainer): @classmethod def build_evaluator(self, *args, **kwargs): @@ -58,11 +61,11 @@ def build_evaluator(self, *args, **kwargs): @classmethod def build_test_loader(self, cfg, dataset_name): mapper = ProposalGenerationMapper(cfg) - return build_detection_test_loader(cfg, dataset_name, - batch_size=cfg.PROPOSAL_GENERATION.BATCH_SIZE, + return build_detection_test_loader(cfg, dataset_name, + batch_size=cfg.PROPOSAL_GENERATION.BATCH_SIZE, mapper=mapper) - + @classmethod def test(self, cfg, model, evaluators=None): results = super().test(cfg, model, evaluators) @@ -70,7 +73,7 @@ def test(self, cfg, model, evaluators=None): if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.log(results) return results - + def setup(args): """ @@ -87,17 +90,22 @@ def setup(args): cfg.freeze() default_setup(cfg, args) - # Setup logger + # Setup logger 
setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="part_distillation") dataset_name_dir = cfg.PROPOSAL_GENERATION.DATASET_NAME if not cfg.PROPOSAL_GENERATION.DEBUG else "debug" - save_path = "pseudo_labels/part_labels/proposal_generation/{}/{}/{}/{}_{}_norm_{}/"\ - .format(dataset_name_dir, - cfg.PROPOSAL_GENERATION.OBJECT_MASK_TYPE, + detic_labeling_mode = cfg.PROPOSAL_GENERATION.DETIC_LABELING_MODE + root_folder_name = cfg.PROPOSAL_GENERATION.ROOT_FOLDER_NAME + save_path = "{}/part_labels/proposal_generation/{}/{}/{}/{}/{}_{}_norm_{}/"\ + .format( + root_folder_name, + detic_labeling_mode, + dataset_name_dir, + cfg.PROPOSAL_GENERATION.OBJECT_MASK_TYPE, "_".join(cfg.PROPOSAL_GENERATION.BACKBONE_FEATURE_KEY_LIST), - cfg.PROPOSAL_GENERATION.DISTANCE_METRIC, + cfg.PROPOSAL_GENERATION.DISTANCE_METRIC, cfg.PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS, cfg.PROPOSAL_GENERATION.FEATURE_NORMALIZE) - + # register dataset register_imagenet( cfg.PROPOSAL_GENERATION.DATASET_NAME, diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ac85bad..0000000 --- a/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -cython -scipy -shapely -timm -h5py -submitit -scikit-image -scikit-learn -pycocotools -wandb -panoptic_parts \ No newline at end of file diff --git a/save_label_visualization.py b/save_label_visualization.py new file mode 100644 index 0000000..a3bb6bc --- /dev/null +++ b/save_label_visualization.py @@ -0,0 +1,36 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +""" +This file is to sanity-check the saved visualization. 
+""" + +import os +import torch +import numpy as np +from pycocotools import mask as coco_mask +from detectron2.structures import BoxMode +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.structures import BitMasks, Instances + +rootpath = "pseudo_labels/part_labels/processed_proposals/human-only-0.3/imagenet_1k_train/detic/res3_res4/dot_4_norm_False/" +augs = [T.ResizeScale(min_scale=1.0, max_scale=1.0, target_height=640, target_width=640)] +targetpath = "visualization/" +if __name__ == "__main__": + cname_list = os.listdir(rootpath) + path_list = [os.path.join(rootpath, c, f) for c in cname_list for f in os.listdir(os.path.join(rootpath, c))] + np.random.shuffle(path_list) + path_list = path_list[:100] + + for path in path_list: + data = torch.load(path, "cpu") + image = utils.read_image(data["file_path"], format="RGB") + image = T.apply_transform_gens(augs, T.AugInput(image))[0].image + data['image'] = image + + torch.save(data, os.path.join(targetpath, data['file_name'])) + diff --git a/sh_files/dcrf/run.sh b/sh_files/dcrf/run.sh deleted file mode 100755 index cd242de..0000000 --- a/sh_files/dcrf/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -TOT_IDS=90 -for ID in 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 -do - python3 continuously_postprocess_dcrf.py \ - --parallel_job_id $ID \ - --num_parallel_jobs $TOT_IDS \ - --res "res4" \ - --num_k 4 & -done diff --git a/sh_files/detic/debug.sh b/sh_files/detic/debug.sh deleted file mode 100755 index b195d9b..0000000 --- a/sh_files/detic/debug.sh +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
- -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -DEBUG_MODE=True -DATASET_NAME=imagenet_1k -SPLIT=train -N_IMS=2 -ID=1 -TOT_IDS=50 -python3 "detic_labeling_net.py" \ ---config-file configs/detic/Detic_Labeling.yaml \ ---num-gpus 1 \ ---num-machines 1 \ ---eval-only \ -OUTPUT_DIR "output/detic_22k/" \ -PROPOSAL_GENERATION.BATCH_SIZE ${N_IMS} \ -PROPOSAL_GENERATION.DEBUG ${DEBUG_MODE} \ -PROPOSAL_GENERATION.DATASET_NAME ${DATASET_NAME}"_"${SPLIT} \ -PROPOSAL_GENERATION.PARTITION_INDEX ${ID} \ -PROPOSAL_GENERATION.TOTAL_PARTITIONS ${TOT_IDS} \ -INPUT.IMAGE_SIZE 640 \ -DATASETS.TEST "('${DATASET_NAME}_${SPLIT}',)" \ -TEST.DETECTIONS_PER_IMAGE 1000 diff --git a/sh_files/detic/run.sh b/sh_files/detic/run.sh deleted file mode 100755 index 6bcc8f4..0000000 --- a/sh_files/detic/run.sh +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -DEBUG_MODE=False -DATASET_NAME=imagenet_22k -SPLIT=train -N_IMS=2 -TOT_IDS=60 -for ID in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 -do - python3 "multi_node_train_net.py" \ - --config-file configs/detic/Detic_Labeling.yaml \ - --num-gpus 8 -p "learnaccel" \ - --num-machines 1 \ - --eval-only \ - --name "detic_labeling_${ID}" \ - --target "detic_labeling_net.py" \ - --job-dir "output/detic_22k/detic_labeling_${ID}/" \ - OUTPUT_DIR "output/detic_22k/" \ - PROPOSAL_GENERATION.BATCH_SIZE ${N_IMS} \ - PROPOSAL_GENERATION.DEBUG ${DEBUG_MODE} \ - PROPOSAL_GENERATION.DATASET_NAME ${DATASET_NAME}"_"${SPLIT} \ - PROPOSAL_GENERATION.PARTITION_INDEX ${ID} \ - PROPOSAL_GENERATION.TOTAL_PARTITIONS ${TOT_IDS} \ - INPUT.IMAGE_SIZE 640 \ - DATASETS.TEST "('${DATASET_NAME}_${SPLIT}',)" \ - TEST.DETECTIONS_PER_IMAGE 1000 -done - - -# DEBUG_MODE=True -# DATASET_NAME=imagenet_1k -# SPLIT=train -# N_IMS=2 -# ID=10 -# TOT_IDS=10 -# python3 "detic_labeling_net.py" \ -# --config-file configs/detic/Detic_Labeling.yaml \ -# --num-gpus 2 \ -# --num-machines 1 \ -# --eval-only \ -# OUTPUT_DIR "output/detic/" \ -# PROPOSAL_GENERATION.BATCH_SIZE ${N_IMS} \ -# PROPOSAL_GENERATION.DEBUG ${DEBUG_MODE} \ -# PROPOSAL_GENERATION.DATASET_NAME ${DATASET_NAME}"_"${SPLIT} \ -# PROPOSAL_GENERATION.PARALLEL_JOB_ID ${ID} \ -# PROPOSAL_GENERATION.NUM_PARALLEL_JOBS ${TOT_IDS} \ -# INPUT.IMAGE_SIZE 640 \ -# TEST.DETECTIONS_PER_IMAGE 1000 diff --git a/sh_files/fewshot_learning/prop/cityscapes.sh b/sh_files/fewshot_learning/prop/cityscapes.sh deleted file mode 100755 index 596b825..0000000 --- a/sh_files/fewshot_learning/prop/cityscapes.sh +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("cityscapes_part_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"cityscapes_part_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Prop_Fewshot_Learning" -grp_name="Cityscapes_Part" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "cp_prop_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -MODEL.WEIGHTS ${model_weights} \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ 
-SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/fewshot_learning/prop/part_imagenet.sh b/sh_files/fewshot_learning/prop/part_imagenet.sh deleted file mode 100755 index da6b01b..0000000 --- a/sh_files/fewshot_learning/prop/part_imagenet.sh +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("part_imagenet_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"part_imagenet_valtest",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -importance_sampling_ratio=0.75 - -exp_name="Prop_Fewshot_Learning" -grp_name="PartImageNet" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. 
- - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "pi_prop_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/fewshot_learning/prop/pascal.sh b/sh_files/fewshot_learning/prop/pascal.sh deleted file mode 100755 index 3a74446..0000000 --- a/sh_files/fewshot_learning/prop/pascal.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("pascal_parts_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"pascal_parts_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - - -exp_name="Prop_Fewshot_Learning" -grp_name="Pascal_Parts" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. - - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "pp_prop_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ 
-SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/fewshot_learning/semseg/cityscapes.sh b/sh_files/fewshot_learning/semseg/cityscapes.sh deleted file mode 100755 index b06f2d5..0000000 --- a/sh_files/fewshot_learning/semseg/cityscapes.sh +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("cityscapes_part_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"cityscapes_part_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Semseg_Fewshot_Learning" -grp_name="Cityscapes_Part" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. 
- -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "cp_semseg_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -MODEL.WEIGHTS ${model_weights} \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/fewshot_learning/semseg/part_imagenet.sh b/sh_files/fewshot_learning/semseg/part_imagenet.sh deleted file mode 100755 index b62574a..0000000 --- a/sh_files/fewshot_learning/semseg/part_imagenet.sh +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("part_imagenet_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"part_imagenet_valtest",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -importance_sampling_ratio=0.75 - -exp_name="Semseg_Fewshot_Learning" -grp_name="PartImageNet" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. - - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "pi_semseg_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ 
-SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/fewshot_learning/semseg/pascal.sh b/sh_files/fewshot_learning/semseg/pascal.sh deleted file mode 100755 index 8e5d60c..0000000 --- a/sh_files/fewshot_learning/semseg/pascal.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("pascal_parts_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"pascal_parts_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - - -exp_name="Semseg_Fewshot_Learning" -grp_name="Pascal_Parts" - -model_weights="" # change this to the trained model path. -percent=100 # change this to desired percentage. -comment="coco_m2f_${percent}" # if model_weight is empty, it will initialize m2f coco-instance segmentation. 
- - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 1 \ ---name "pp_semseg_${percent}" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -FEWSHOT_LEARNING.LABEL_PERCENTAGE ${percent} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/part_distillation_training/train.sh b/sh_files/part_distillation_training/train.sh deleted file mode 100755 index b0310e8..0000000 --- a/sh_files/part_distillation_training/train.sh +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=256 -aug_list='["crop","scale","flip"]' -freeze_keys='["backbone","encoder"]' -LR="0.0001" -MIN_RATIO='0.05' -MIN_OBJECT_RATIO='0.05' -MIN_SCORE='-1.0' -MAX_ITER='120000' -PER_PIXEL="True" - -train_dataset='("imagenet_22k_train",)' -val_dataset='("part_imagenet_match_valtest","part_imagenet_evaluate_valtest",)' - -oversample_ratio=3.0 -importance_sampling_ratio=0.0 - -exp_name="Part_Distillation_Train" -num_obj_classes=22000 -num_part_classes=8 -pseudo_ann_path="pseudo_labels_saved/part_labels/part_masks_with_class/imagenet_22k_train/" -grp_name="IN22K" -comment="lr_${LR}" - -python3 "multi_node_train_net.py" \ ---config-file configs/part_distillation/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 8 \ ---use-volta32 \ ---name ${comment} \ ---target "part_distillation_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 1000 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(100000, 110000)' \ -CUSTOM_DATASETS.BASE_SIZE 640 \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.MIN_OBJECT_AREA_RATIO ${MIN_OBJECT_RATIO} \ -CUSTOM_DATASETS.MIN_AREA_RATIO ${MIN_RATIO} \ -CUSTOM_DATASETS.MIN_SCORE ${MIN_SCORE} \ -CUSTOM_DATASETS.DATASET_PATH ${pseudo_ann_path} \ -CUSTOM_DATASETS.PATH_ONLY True \ -PART_DISTILLATION.NUM_OBJECT_CLASSES ${num_obj_classes} \ -PART_DISTILLATION.NUM_PART_CLASSES 
${num_part_classes} \ -PART_DISTILLATION.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -PART_DISTILLATION.SET_IMAGE_SQUARE True \ -CUSTOM_DATASETS.DEBUG False diff --git a/sh_files/part_distillation_training/train_single.sh b/sh_files/part_distillation_training/train_single.sh deleted file mode 100755 index 809c2d3..0000000 --- a/sh_files/part_distillation_training/train_single.sh +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=32 -aug_list='["crop","scale","flip"]' -freeze_keys='["backbone","encoder"]' -LR="0.0001" -MIN_RATIO='0.001' -MIN_OBJECT_RATIO='0.001' -MIN_SCORE='-1.0' -MAX_ITER='80000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("imagenet_22k_train",)' -val_dataset='("pascal_match_val","pascal_evaluate_val",)' -# val_dataset='("part_imagenet_match_val","part_imagenet_evaluate_val",)' -# val_dataset='("cityscapes_part_match_val","cityscapes_part_evaluate_val",)' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.0 - -exp_name="Part_Distillation_Train" -num_obj_classes=22000 -num_part_classes=8 -pseudo_ann_path="pseudo_labels_saved/part_labels/part_masks_with_class/imagenet_22k_train/" -# grp_name="1k_detic_prediction" -grp_name="debug" -comment="lr_${LR}" - -python3 "part_distillation_train_net.py" \ ---config-file configs/part_distillation/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ 
-WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 20 \ -WANDB.VIS_PERIOD_TEST 20 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(70000, 75000)' \ -CUSTOM_DATASETS.BASE_SIZE 640 \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -PART_DISTILLATION.MIN_OBJECT_AREA_RATIO ${MIN_OBJECT_RATIO} \ -PART_DISTILLATION.MIN_AREA_RATIO ${MIN_RATIO} \ -PART_DISTILLATION.MIN_SCORE ${MIN_SCORE} \ -PART_DISTILLATION.DATASET_PATH ${pseudo_ann_path} \ -PART_DISTILLATION.PATH_ONLY True \ -PART_DISTILLATION.NUM_OBJECT_CLASSES ${num_obj_classes} \ -PART_DISTILLATION.NUM_PART_CLASSES ${num_part_classes} \ -PART_DISTILLATION.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -PART_DISTILLATION.SET_IMAGE_SQUARE True \ -PART_DISTILLATION.DEBUG True diff --git a/sh_files/part_ranking/run.sh b/sh_files/part_ranking/run.sh deleted file mode 100755 index 5441168..0000000 --- a/sh_files/part_ranking/run.sh +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=32 -train_dataset='("imagenet_22k_train",)' -val_dataset='("imagenet_22k_pre_labeling_train","imagenet_22k_post_labeling_train",)' - - -# Control factors -PER_PIXEL_CLUSTERING="True" -PER_PIXEL_LABELING="True" -MIN_RATIO1='0.05' -MIN_SCORE1='0.3' -MIN_RATIO2='0.05' -MIN_SCORE2='0.1' -num_pcluster=8 -cls_metric="l2" -prop_key="decoder_output" -FEAT_NORM=True -partition=True -# pid=0 -total_p=50 - - -LR="0.0001" -model_id="0049999" - -exp_name="Part_Ranking" -grp_name="IN21K+COCO" - -# Chnage this -prop_model_type="detic_and_score" -prop_model_name="lr_${LR}" -weight_path="" -obj_mask_type="detic_predictions" -pseudo_ann_path="pseudo_labels/part_labels/imagenet_22k_train/${obj_mask_type}/" - -for pid in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 -do - comment="pp_${PER_PIXEL_CLUSTERING}_s_${MIN_SCORE1}_${pid}" - - python3 "multi_node_train_net.py" \ - --config-file configs/part_ranking/swinL_IN21K_384_mask2former.yaml \ - --num-gpus 8 -p "learnaccel" \ - --num-machines 1 \ - --eval-only \ - --target "part_ranking_train_net.py" \ - --job-dir "output/${exp_name}/${grp_name}/${comment}/" \ - MODEL.WEIGHTS ${weight_path} \ - OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ - DATASETS.TRAIN ${train_dataset} \ - DATASETS.TEST ${val_dataset} \ - WANDB.DISABLE_WANDB False \ - WANDB.RUN_NAME ${comment} \ - WANDB.PROJECT ${exp_name} \ - WANDB.GROUP ${grp_name} \ - WANDB.VIS_PERIOD_TRAIN 200 \ - WANDB.VIS_PERIOD_TEST 20 \ - PART_RANKING.MIN_AREA_RATIO_1 ${MIN_RATIO1} \ - PART_RANKING.MIN_SCORE_1 ${MIN_SCORE1} \ - PART_RANKING.MIN_AREA_RATIO_2 ${MIN_RATIO2} \ - PART_RANKING.MIN_SCORE_2 ${MIN_SCORE2} \ - CUSTOM_DATASETS.USE_MERGED_GT True \ - PART_RANKING.DATASET_PATH ${pseudo_ann_path} \ - PART_RANKING.USE_PER_PIXEL_LABEL_DURING_CLUSTERING ${PER_PIXEL_CLUSTERING} \ - PART_RANKING.USE_PER_PIXEL_LABEL_DURING_LABELING ${PER_PIXEL_LABELING} \ - 
PART_RANKING.PROPOSAL_KEY ${prop_key} \ - PART_RANKING.CLASSIFIER_METRIC ${cls_metric} \ - PART_RANKING.NUM_CLUSTERS ${num_pcluster} \ - PART_RANKING.APPLY_MASKING_WITH_OBJECT_MASK True \ - PART_RANKING.OBJECT_MASK_TYPE ${obj_mask_type} \ - PART_RANKING.PROPOSAL_MODEL_TYPE ${prop_model_type} \ - PART_RANKING.PROPOSAL_MODEL_NAME ${prop_model_name} \ - PART_RANKING.PROPOSAL_FEATURE_NORM ${FEAT_NORM} \ - PART_RANKING.PARTITION_INDEX ${pid} \ - PART_RANKING.TOTAL_PARTITIONS ${total_p} \ - PART_RANKING.DEBUG False -done diff --git a/sh_files/part_ranking/run_single.sh b/sh_files/part_ranking/run_single.sh deleted file mode 100755 index aa2a4f3..0000000 --- a/sh_files/part_ranking/run_single.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=32 -train_dataset='("imagenet_1k_train",)' -val_dataset='("imagenet_1k_pre_labeling_train","imagenet_1k_post_labeling_train",)' - - -# Control factors -PER_PIXEL_CLUSTERING="True" -PER_PIXEL_LABELING="True" -MIN_RATIO1='0.05' -MIN_SCORE1='0.3' -MIN_RATIO2='0.01' -MIN_SCORE2='0.1' -num_pcluster=8 -cls_metric="l2" -prop_key="decoder_output" -FEAT_NORM=True - -model_id="0049999" - -exp_name="Part_Ranking" -grp_name="IN21K+COCO" - -# Chnage this -prop_model_type="detic_and_score" -prop_model_name="lr_${LR}" -weight_path="weights/proposal_model/lr_0.0001_0039999.pth" -pseudo_ann_path="pseudo_labels_saved/part_labels/proposal_generation/imagenet_22k_train/detic_based/generated_proposals_new_processed/res3_res4/dot_4_norm_False/" -pseudo_ann_path_extra="pseudo_labels_saved/part_labels/proposal_generation/imagenet_1k_train/generated_proposals_processed/score_based/res4/l2_4/" -pid=2 -total_p=50 -comment="pp_${PER_PIXEL_CLUSTERING}_s_${MIN_SCORE1}_${pid}" - -python3 "part_ranking_train_net.py" \ ---config-file configs/part_ranking/swinL_IN21K_384_mask2former.yaml \ 
---num-gpus 8 \ ---num-machines 1 \ ---eval-only \ -MODEL.WEIGHTS ${weight_path} \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 20 \ -PART_RANKING.MIN_AREA_RATIO_1 ${MIN_RATIO1} \ -PART_RANKING.MIN_SCORE_1 ${MIN_SCORE1} \ -PART_RANKING.MIN_AREA_RATIO_2 ${MIN_RATIO2} \ -PART_RANKING.MIN_SCORE_2 ${MIN_SCORE2} \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -PART_RANKING.DATASET_PATH ${pseudo_ann_path} \ -PART_RANKING.DATASET_PATH_LIST "('${pseudo_ann_path}','${pseudo_ann_path_extra}',)" \ -PART_RANKING.USE_PER_PIXEL_LABEL_DURING_CLUSTERING ${PER_PIXEL_CLUSTERING} \ -PART_RANKING.USE_PER_PIXEL_LABEL_DURING_LABELING ${PER_PIXEL_LABELING} \ -PART_RANKING.PROPOSAL_KEY ${prop_key} \ -PART_RANKING.CLASSIFIER_METRIC ${cls_metric} \ -PART_RANKING.NUM_CLUSTERS ${num_pcluster} \ -PART_RANKING.APPLY_MASKING_WITH_OBJECT_MASK True \ -PART_RANKING.PROPOSAL_FEATURE_NORM ${FEAT_NORM} \ -PART_RANKING.PARTITION_INDEX ${pid} \ -PART_RANKING.TOTAL_PARTITIONS ${total_p} \ -PART_RANKING.DEBUG True diff --git a/sh_files/pixel_grouping/run.sh b/sh_files/pixel_grouping/run.sh deleted file mode 100755 index f5a78ba..0000000 --- a/sh_files/pixel_grouping/run.sh +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -train_dataset='("imagenet_1k_train",)' -val_dataset='("part_imagenet_valtest",)' -exp_name="PixelGrouping_Evaluation" - -feat_norm=False -metric="dot" -grp_name="swinL_m2f" -comment="debug" -NUM_CLUSTERS=4 -feat_list='["res3","res4"]' - - -python3 "pixel_grouping_test_net.py" \ ---config-file configs/pixel_grouping/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ ---eval-only \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TEST ${val_dataset} \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.DISABLE_WANDB False \ -WANDB.VIS_PERIOD_TEST 30 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -PIXEL_GROUPING.NUM_SUPERPIXEL_CLUSTERS ${NUM_CLUSTERS} \ -PIXEL_GROUPING.DISTANCE_METRIC ${metric} \ -PIXEL_GROUPING.BACKBONE_FEATURE_KEY_LIST ${feat_list} \ -TEST.EVAL_PERIOD 50 \ -CUSTOM_DATASETS.PART_IMAGENET.DEBUG False diff --git a/sh_files/proposal_generation/run.sh b/sh_files/proposal_generation/run.sh deleted file mode 100755 index 3b7b0a5..0000000 --- a/sh_files/proposal_generation/run.sh +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -NUM_CLUSTERS=4 -DEBUG_MODE=False -DATASET_NAME=imagenet_22k -SPLIT=train -metric="dot" -feat_list='["res3","res4"]' -N_IMS=1 -feat_norm=False -TOT_IDS=40 -exp_name="ProposalGeneration" -grp_name="res34_${metric}" -object_mask_path="pseudo_labels_saved/object_labels/imagenet_22k_train/detic_predictions/" -for ID in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 -do - comment="k${NUM_CLUSTERS}i${ID}" - - python3 "multi_node_train_net.py" \ - --config-file configs/proposal_generation/swinL_IN21K_384_mask2former.yaml \ - --num-gpus 8 -p "learnaccel" \ - --num-machines 1 \ - --name ${comment} \ - --target "proposal_generation_net.py" \ - --job-dir "output/${exp_name}/${grp_name}/${comment}/" \ - VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ - OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ - WANDB.DISABLE_WANDB False \ - WANDB.RUN_NAME ${comment} \ - WANDB.PROJECT ${exp_name} \ - WANDB.GROUP ${grp_name} \ - WANDB.VIS_PERIOD_TEST 2000 \ - PROPOSAL_GENERATION.OBJECT_MASK_PATH ${object_mask_path} \ - DATASETS.TEST "('${DATASET_NAME}"_"${SPLIT}',)" \ - MODEL.META_ARCHITECTURE "ProposalGenerationModel" \ - PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS ${NUM_CLUSTERS} \ - PROPOSAL_GENERATION.DATASET_NAME ${DATASET_NAME}"_"${SPLIT} \ - PROPOSAL_GENERATION.PARTITION_INDEX ${ID} \ - PROPOSAL_GENERATION.TOTAL_PARTITIONS ${TOT_IDS} \ - PROPOSAL_GENERATION.OBJECT_MASK_TYPE "detic" \ - PROPOSAL_GENERATION.WITH_GIVEN_MASK True \ - PROPOSAL_GENERATION.DISTANCE_METRIC ${metric} \ - PROPOSAL_GENERATION.BACKBONE_FEATURE_KEY_LIST ${feat_list} \ - PROPOSAL_GENERATION.FEATURE_NORMALIZE ${feat_norm} \ - PROPOSAL_GENERATION.BATCH_SIZE ${N_IMS} \ - PROPOSAL_GENERATION.DEBUG ${DEBUG_MODE} -done diff --git a/sh_files/proposal_generation/run_single.sh b/sh_files/proposal_generation/run_single.sh deleted file mode 100755 index 5bd28ba..0000000 --- a/sh_files/proposal_generation/run_single.sh +++ /dev/null 
@@ -1,46 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -NUM_CLUSTERS=4 -DEBUG_MODE=False -DATASET_NAME=imagenet_22k -SPLIT=train -metric="dot" -feat_list='["res3","res4"]' -N_IMS=1 -feat_norm=False -TOT_IDS=20 -object_mask_path="pseudo_labels_saved/object_labels/imagenet_22k_train/detic_predictions/" -exp_name="ProposalGeneration" -grp_name="res34_${metric}" -ID=1 - -comment="debug" - -python3 "proposal_generation_net.py" \ ---config-file configs/proposal_generation/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 1 \ ---num-machines 1 \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TEST 2000 \ -DATASETS.TEST "('${DATASET_NAME}"_"${SPLIT}',)" \ -MODEL.META_ARCHITECTURE "ProposalGenerationModel" \ -PROPOSAL_GENERATION.OBJECT_MASK_PATH ${object_mask_path} \ -PROPOSAL_GENERATION.NUM_SUPERPIXEL_CLUSTERS ${NUM_CLUSTERS} \ -PROPOSAL_GENERATION.DATASET_NAME ${DATASET_NAME}"_"${SPLIT} \ -PROPOSAL_GENERATION.PARTITION_INDEX ${ID} \ -PROPOSAL_GENERATION.TOTAL_PARTITIONS ${TOT_IDS} \ -PROPOSAL_GENERATION.OBJECT_MASK_TYPE "detic" \ -PROPOSAL_GENERATION.WITH_GIVEN_MASK True \ -PROPOSAL_GENERATION.DISTANCE_METRIC ${metric} \ -PROPOSAL_GENERATION.BACKBONE_FEATURE_KEY_LIST ${feat_list} \ -PROPOSAL_GENERATION.FEATURE_NORMALIZE ${feat_norm} \ -PROPOSAL_GENERATION.BATCH_SIZE ${N_IMS} \ -PROPOSAL_GENERATION.DEBUG ${DEBUG_MODE} diff --git a/sh_files/proposal_learning/train_multi.sh b/sh_files/proposal_learning/train_multi.sh deleted file mode 100755 index 1830f61..0000000 --- a/sh_files/proposal_learning/train_multi.sh +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
- -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=512 -aug_list='["crop","scale","flip"]' -freeze_keys='["backbone","encoder"]' -LR="0.0001" -MIN_OBJ_RATIO='0.05' -MIN_RATIO='0.05' -MIN_SCORE='-1.0' -MAX_ITER='50000' -PER_PIXEL="True" - -train_dataset='("imagenet_22k_train",)' -val_dataset='("pascal_part_val","part_imagenet_valtest","cityscapes_part_val",)' -process_list='("prop","prop","prop",)' -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.0 - -exp_name="Proposal_Learning_Train" -grp_name="IN22K+COCO" - -pseudo_ann_path="pseudo_labels_saved/part_labels/proposal_generation/imagenet_22k_train/detic_based/generated_proposals_new_processed/res3_res4/dot_4_norm_False/" -pseudo_ann_path_extra="pseudo_labels_saved/part_labels/proposal_generation/imagenet_1k_train/generated_proposals_processed/score_based/res4/l2_4/" -filtered_code_path_list='()' -comment="lr_${LR}" - -python3 "multi_node_train_net.py" \ ---config-file configs/proposal_learning/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 8 \ ---resume \ ---use-volta32 \ ---name "prop" \ ---target "part_proposal_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.BASE_SIZE 640 \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 1000 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH 
${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 10000 \ -PROPOSAL_LEARNING.MIN_OBJECT_AREA_RATIO ${MIN_OBJ_RATIO} \ -PROPOSAL_LEARNING.MIN_AREA_RATIO ${MIN_RATIO} \ -PROPOSAL_LEARNING.MIN_SCORE ${MIN_SCORE} \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -PROPOSAL_LEARNING.DATASET_PATH_LIST "('${pseudo_ann_path}','${pseudo_ann_path_extra}',)" \ -PROPOSAL_LEARNING.DATASET_PATH ${pseudo_ann_path} \ -PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -PROPOSAL_LEARNING.APPLY_MASKING_WITH_OBJECT_MASK True \ -PROPOSAL_LEARNING.FILTERED_CODE_PATH_LIST ${filtered_code_path_list} \ -PROPOSAL_LEARNING.POSTPROCESS_TYPES ${process_list} \ -PROPOSAL_LEARNING.PATH_ONLY True \ -PROPOSAL_LEARNING.DEBUG False diff --git a/sh_files/proposal_learning/train_single.sh b/sh_files/proposal_learning/train_single.sh deleted file mode 100755 index b1c5231..0000000 --- a/sh_files/proposal_learning/train_single.sh +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=8 -aug_list='["crop","scale","flip"]' -freeze_keys='["backbone","encoder"]' -LR="0.0001" -MIN_RATIO='0.01' -MIN_SCORE='-1.0' -MAX_ITER='80000' -PER_PIXEL="True" - -train_dataset='("imagenet_22k_train",)' -# val_dataset='("pascal_part_pre_labeling_val","pascal_part_match_val","pascal_part_miou_val",)' -val_dataset='("pascal_part_val","part_imagenet_valtest","cityscapes_part_val",)' -process_list='("prop","prop","prop",)' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.0 - -exp_name="Proposal_Learning_Train" -grp_name="IN21K+COCO" - -pseudo_ann_path="pseudo_labels_saved/part_labels/proposal_generation/imagenet_22k_train/detic_based/generated_proposals_new_processed/res3_res4/dot_4_norm_False/" -pseudo_ann_path_extra="pseudo_labels_saved/part_labels/proposal_generation/imagenet_1k_train/generated_proposals_processed/score_based/res4/l2_4/" -filtered_code_path_list='("datasets/metadata/imagenet_12k_code_list.pkl",)' -comment="lr_${LR}" - -python3 "part_proposal_train_net.py" \ ---config-file configs/proposal_learning/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.BASE_SIZE 640 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 10 \ -WANDB.VIS_PERIOD_TEST 10 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(60000, 70000)' \ -TEST.EVAL_PERIOD 10 \ -PROPOSAL_LEARNING.MIN_AREA_RATIO ${MIN_RATIO} \ 
-PROPOSAL_LEARNING.MIN_SCORE ${MIN_SCORE} \ -PROPOSAL_LEARNING.DATASET_PATH_LIST "('${pseudo_ann_path}','${pseudo_ann_path_extra}',)" \ -PROPOSAL_LEARNING.DATASET_PATH ${pseudo_ann_path} \ -PROPOSAL_LEARNING.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -PROPOSAL_LEARNING.APPLY_MASKING_WITH_OBJECT_MASK True \ -PROPOSAL_LEARNING.FILTERED_CODE_PATH_LIST ${filtered_code_path_list} \ -PROPOSAL_LEARNING.POSTPROCESS_TYPES ${process_list} \ -PROPOSAL_LEARNING.PATH_ONLY True \ -PROPOSAL_LEARNING.DEBUG True diff --git a/sh_files/supervised_learning/prop/cityscapes.sh b/sh_files/supervised_learning/prop/cityscapes.sh deleted file mode 100755 index 0da673e..0000000 --- a/sh_files/supervised_learning/prop/cityscapes.sh +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("cityscapes_part_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"cityscapes_part_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Prop_Supervised_Learning" -grp_name="Cityscapes_Part" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "cp_prop" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO 
${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/prop/part_imagenet.sh b/sh_files/supervised_learning/prop/part_imagenet.sh deleted file mode 100755 index ee7aeb3..0000000 --- a/sh_files/supervised_learning/prop/part_imagenet.sh +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("part_imagenet_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"part_imagenet_valtest",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -importance_sampling_ratio=0.75 - -exp_name="Prop_Supervised_Learning" -grp_name="PartImageNet" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "pi_prop" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/prop/pascal.sh b/sh_files/supervised_learning/prop/pascal.sh deleted file mode 100755 index 67bedba..0000000 --- 
a/sh_files/supervised_learning/prop/pascal.sh +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("pascal_parts_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"pascal_parts_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Prop_Supervised_Learning" -grp_name="PascalParts" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "pp_prop" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 1 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ 
-SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING True \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/semseg/city_single.sh b/sh_files/supervised_learning/semseg/city_single.sh deleted file mode 100755 index e0c678f..0000000 --- a/sh_files/supervised_learning/semseg/city_single.sh +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=16 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="False" -PER_PIXEL="True" - -train_dataset='("cityscapes_part_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"cityscapes_part_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="PascalParts" -comment="debug" - -python3 "supervised_train_net.py" \ ---config-file configs/supervised_learning/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 20 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 100 \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ 
-SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG True diff --git a/sh_files/supervised_learning/semseg/cityscapes.sh b/sh_files/supervised_learning/semseg/cityscapes.sh deleted file mode 100755 index f2db3f9..0000000 --- a/sh_files/supervised_learning/semseg/cityscapes.sh +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("cityscapes_part_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"cityscapes_part_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="Cityscapes_Part" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "cp_semseg" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ 
-WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/semseg/part_imagenet.sh b/sh_files/supervised_learning/semseg/part_imagenet.sh deleted file mode 100755 index cde7952..0000000 --- a/sh_files/supervised_learning/semseg/part_imagenet.sh +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -PER_PIXEL="True" - -train_dataset='("part_imagenet_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"part_imagenet_valtest",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="PartImageNet" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "pi_semseg" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/semseg/part_imagenet_single.sh b/sh_files/supervised_learning/semseg/part_imagenet_single.sh deleted file mode 100755 index ead7161..0000000 --- 
a/sh_files/supervised_learning/semseg/part_imagenet_single.sh +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=16 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="False" -PER_PIXEL="True" - -train_dataset='("part_imagenet_train",)' -val_dataset='("part_imagenet_valtest",,)' - -oversample_ratio=3.0 -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="PartImageNet" -comment="debug" - -python3 "supervised_train_net.py" \ ---config-file configs/supervised_learning/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 20 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 10 \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE True \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.PART_IMAGENET.DEBUG True diff --git a/sh_files/supervised_learning/semseg/pascal.sh 
b/sh_files/supervised_learning/semseg/pascal.sh deleted file mode 100755 index 4a0375b..0000000 --- a/sh_files/supervised_learning/semseg/pascal.sh +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=128 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="True" -PER_PIXEL="True" - -train_dataset='("pascal_parts_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"pascal_parts_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="PascalParts" -comment="coco_m2f" - -python3 "multi_node_train_net.py" \ ---config-file configs/supervised_train_net/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 -p "learnaccel" \ ---num-machines 4 \ ---name "pp_semseg" \ ---target "supervised_train_net.py" \ ---job-dir "output/${exp_name}/${grp_name}/${comment}/multi_node/" \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 200 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 45000)' \ -TEST.EVAL_PERIOD 20000 \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ 
-SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG False diff --git a/sh_files/supervised_learning/semseg/pascal_single.sh b/sh_files/supervised_learning/semseg/pascal_single.sh deleted file mode 100755 index d6fc101..0000000 --- a/sh_files/supervised_learning/semseg/pascal_single.sh +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. - -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -batch_size=16 -aug_list='["crop","scale","flip"]' -freeze_keys='[]' -LR="0.0001" -MAX_ITER='20000' -NORM="False" -PER_PIXEL="True" - -train_dataset='("pascal_parts_train",)' -val_dataset='(' -val_dataset=${val_dataset}'"pascal_parts_val",' -val_dataset=${val_dataset}')' - -oversample_ratio=3.0 -inverse_sampling=False -importance_sampling_ratio=0.75 - -exp_name="Semseg_Supervised_Learning" -grp_name="PascalParts" -comment="debug" - -python3 "supervised_train_net.py" \ ---config-file configs/supervised_learning/swinL_IN21K_384_mask2former.yaml \ ---num-gpus 8 \ ---num-machines 1 \ -OUTPUT_DIR "output/${exp_name}/${grp_name}/${comment}/" \ -VIS_OUTPUT_DIR "vis_logs/${exp_name}/${grp_name}/${comment}/" \ -DATASETS.TRAIN ${train_dataset} \ -DATASETS.TEST ${val_dataset} \ -MODEL.MASK_FORMER.FREEZE_KEYS ${freeze_keys} \ -MODEL.MASK_FORMER.QUERY_FEATURE_NORMALIZE ${NORM} \ -MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO ${importance_sampling_ratio} \ -MODEL.MASK_FORMER.OVERSAMPLE_RATIO ${oversample_ratio} \ -WANDB.DISABLE_WANDB False \ -WANDB.RUN_NAME ${comment} \ -WANDB.PROJECT ${exp_name} \ -WANDB.GROUP ${grp_name} \ -WANDB.VIS_PERIOD_TRAIN 20 \ -WANDB.VIS_PERIOD_TEST 50 \ -SOLVER.MAX_ITER ${MAX_ITER} \ -SOLVER.IMS_PER_BATCH ${batch_size} \ -SOLVER.BASE_LR ${LR} \ -SOLVER.STEPS '(40000, 
45000)' \ -TEST.EVAL_PERIOD 100 \ -MODEL.SEM_SEG_HEAD.NUM_CLASSES 50 \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE False \ -SUPERVISED_MODEL.APPLY_MASKING_WITH_OBJECT_MASK True \ -SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING False \ -SUPERVISED_MODEL.USE_PER_PIXEL_LABEL ${PER_PIXEL} \ -CUSTOM_DATASETS.USE_MERGED_GT True \ -CUSTOM_DATASETS.AUG_NAME_LIST ${aug_list} \ -CUSTOM_DATASETS.PASCAL_PARTS.DEBUG True diff --git a/supervised_train_net.py b/supervised_train_net.py index af6a3b2..b3ea46a 100644 --- a/supervised_train_net.py +++ b/supervised_train_net.py @@ -13,14 +13,14 @@ except: pass -import sys -import os +import sys +import os import torch -import torch.nn as nn -import numpy as np +import torch.nn as nn +import numpy as np import logging import detectron2.utils.comm as comm -import wandb +import wandb sys.path.append('Detic/third_party/CenterNet2') sys.path.append('Detic/third_party/Deformable-DETR') @@ -30,8 +30,8 @@ from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.data import (MetadataCatalog, - build_detection_test_loader, +from detectron2.data import (MetadataCatalog, + build_detection_test_loader, build_detection_train_loader) from detectron2.engine import (default_argument_parser, @@ -43,9 +43,9 @@ from detectron2.utils.comm import is_main_process, synchronize from detectron2.evaluation import verify_results, inference_on_dataset, print_csv_format -from part_distillation import (add_maskformer2_config, - add_wandb_config, - add_supervised_model_config, +from part_distillation import (add_maskformer2_config, + add_wandb_config, + add_supervised_model_config, add_fewshot_learning_config, add_custom_datasets_config) @@ -69,7 +69,7 @@ def build_evaluator(self, cfg, dataset_name): if cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_LEARNING \ or cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE: return ProposalEvaluator() - else: + else: return Supervised_mIOU_Evaluator(dataset_name, 
cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES) @@ -78,10 +78,10 @@ def build_train_loader(self, cfg): if "pascal" in cfg.DATASETS.TRAIN[0]: mapper = VOCPartsMapper(cfg, is_train=True) elif "part_imagenet" in cfg.DATASETS.TRAIN[0]: - mapper = PartImageNetMapper(cfg, is_train=True) + mapper = PartImageNetMapper(cfg, cfg.DATASETS.TRAIN[0], is_train=True) elif "cityscapes" in cfg.DATASETS.TRAIN[0]: mapper = CityscapesPartMapper(cfg, is_train=True) - + return build_detection_train_loader(cfg, mapper=mapper) @@ -90,7 +90,7 @@ def build_test_loader(self, cfg, dataset_name): if "pascal" in dataset_name: mapper = VOCPartsMapper(cfg, is_train=False) elif "part_imagenet" in dataset_name: - mapper = PartImageNetMapper(cfg, is_train=False) + mapper = PartImageNetMapper(cfg, dataset_name, is_train=False) elif "cityscapes" in dataset_name: mapper = CityscapesPartMapper(cfg, is_train=False) @@ -116,7 +116,7 @@ def test(cls, cfg, model): logger.info("Evaluation results for {} in csv format:".format(dataset_name)) print_csv_format(results_i) comm.synchronize() - + if len(results) == 1: results = list(results.values())[0] @@ -125,7 +125,7 @@ def test(cls, cfg, model): wandb.log(results) return results - + def setup(args): """ @@ -144,16 +144,16 @@ def setup(args): cfg.freeze() default_setup(cfg, args) - # Setup logger + # Setup logger setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="supervised") - + # for part-imagenet mapping. 
register_imagenet("imagenet_1k_meta_train", "train", partitioned_imagenet=False) # register dataset if "part_imagenet" in cfg.DATASETS.TRAIN[0]: - register_part_imagenet(name=cfg.DATASETS.TRAIN[0], + register_part_imagenet(name=cfg.DATASETS.TRAIN[0], images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, split=cfg.DATASETS.TRAIN[0].split('_')[-1], @@ -170,7 +170,7 @@ def setup(args): path_only=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.PATH_ONLY, debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, ) - + elif "pascal" in cfg.DATASETS.TRAIN[0]: register_pascal_parts( name=cfg.DATASETS.TRAIN[0], @@ -184,10 +184,10 @@ def setup(args): ) else: raise ValueError("{} not supported.".format(dataset_name)) - + for dataset_name in cfg.DATASETS.TEST: if "part_imagenet" in dataset_name: - register_part_imagenet(name=dataset_name, + register_part_imagenet(name=dataset_name, images_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.IMAGES_DIRNAME, annotations_dirname=cfg.CUSTOM_DATASETS.PART_IMAGENET.ANNOTATIONS_DIRNAME, split=dataset_name.split('_')[-1], @@ -204,7 +204,7 @@ def setup(args): and (not cfg.SUPERVISED_MODEL.CLASS_AGNOSTIC_INFERENCE), debug=cfg.CUSTOM_DATASETS.CITYSCAPES_PART.DEBUG, ) - + elif "pascal" in dataset_name: register_pascal_parts( name=dataset_name, @@ -226,7 +226,7 @@ def setup(args): def main(args): cfg = setup(args) if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: - run_name = cfg.WANDB.RUN_NAME + run_name = cfg.WANDB.RUN_NAME if not os.path.exists(cfg.VIS_OUTPUT_DIR): os.makedirs(cfg.VIS_OUTPUT_DIR) wandb.init(project=cfg.WANDB.PROJECT, sync_tensorboard=True, name=run_name, @@ -249,7 +249,7 @@ def main(args): res = trainer.train() if comm.is_main_process() and not cfg.WANDB.DISABLE_WANDB: wandb.finish() - return res + return res if __name__ == "__main__":