commit fd82b5c7ebe5b1cacfb8d79144d1ad93589ca3bf
Author: leaf <48828021+leafliber@users.noreply.github.com>
Date:   Wed Jun 23 08:58:10 2021 +0800

    initialize

diff --git a/.travis/precommit.sh b/.travis/precommit.sh
new file mode 100644
index 0000000..bcbfb2b
--- /dev/null
+++ b/.travis/precommit.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+function abort(){
+    echo "Your commit does not fit the PaddlePaddle code style" 1>&2
+    echo "Please use the pre-commit scripts to auto-format your code" 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+cd `dirname $0`
+cd ..
+export PATH=/usr/bin:$PATH
+pre-commit install
+
+if ! pre-commit run -a ; then
+    ls -lh
+    git diff --exit-code
+    exit 1
+fi
+
+trap : 0
diff --git a/.travis/requirements.txt b/.travis/requirements.txt
new file mode 100644
index 0000000..27a340d
--- /dev/null
+++ b/.travis/requirements.txt
@@ -0,0 +1,8 @@
+# Add Python requirements for unit tests here. Note that installing pycocotools
+# directly is not supported in Travis CI; it is compiled and installed from
+# source in unittest.sh.
+tqdm
+cython
+shapely
+llvmlite==0.33
+numba==0.50
diff --git a/.travis/unittest.sh b/.travis/unittest.sh
new file mode 100644
index 0000000..e718331
--- /dev/null
+++ b/.travis/unittest.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+abort(){
+    echo "Running the unit tests failed" 1>&2
+    echo "Please check your code" 1>&2
+    echo "  1. You can run the unit tests locally with 'bash .travis/unittest.sh'" 1>&2
+    echo "  2. Add any new Python requirements used by the unit tests to .travis/requirements.txt" 1>&2
+    exit 1
+}
+
+unittest(){
+    if [ $? != 0 ]; then
+        exit 1
+    fi
+    find "./ppdet" -name 'tests' -type d -print0 | \
+        xargs -0 -I{} -n1 bash -c \
+        'python -m unittest discover -v -s {}'
+}
+
+trap 'abort' 0
+set -e
+
+# install travis python dependencies, excluding pycocotools
+if [ -f ".travis/requirements.txt" ]; then
+    pip install -r .travis/requirements.txt
+fi
+
+# install pycocotools from source if it is not installed yet
+if [ `pip list | grep pycocotools | wc -l` -eq 0 ]; then
+    # install git if it is not available
+    if ! command -v git > /dev/null 2>&1; then
+        apt-get update
+        apt-get install -y git
+    fi;
+    git clone https://github.com/cocodataset/cocoapi.git
+    cd cocoapi/PythonAPI
+    make install
+    python setup.py install --user
+    cd ../..
+    rm -rf cocoapi
+fi
+
+export PYTHONPATH=`pwd`:$PYTHONPATH
+
+unittest .
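+# NOTE: reaching this point means every step above succeeded; the 'trap : 0'
+# below resets the EXIT trap so abort() is not triggered on a successful run.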
+ +trap : 0 diff --git a/configs/cascade_rcnn/README.md b/configs/cascade_rcnn/README.md new file mode 100644 index 0000000..d93ec4f --- /dev/null +++ b/configs/cascade_rcnn/README.md @@ -0,0 +1,28 @@ +# Cascade R-CNN: High Quality Object Detection and Instance Segmentation + +## Model Zoo + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | Mask AP | 下载 | 配置文件 | +| :------------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50-FPN | Cascade Faster | 1 | 1x | ---- | 41.1 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml) | +| ResNet50-FPN | Cascade Mask | 1 | 1x | ---- | 41.8 | 36.3 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 1x | ---- | 44.4 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Faster | 1 | 2x | ---- | 45.0 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 1x | ---- | 44.9 | 39.1 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Cascade Mask | 1 | 2x | ---- | 45.7 | 39.7 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | + + +## Citations +``` +@article{Cai_2019, + title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation}, + ISSN={1939-3539}, + url={http://dx.doi.org/10.1109/tpami.2019.2956516}, + DOI={10.1109/tpami.2019.2956516}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Cai, Zhaowei and Vasconcelos, Nuno}, + year={2019}, + pages={1–1} +} +``` diff --git a/configs/cascade_rcnn/_base_/cascade_fpn_reader.yml b/configs/cascade_rcnn/_base_/cascade_fpn_reader.yml new file mode 100644 index 0000000..cf54ecc --- /dev/null +++ b/configs/cascade_rcnn/_base_/cascade_fpn_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + 
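+  # Note: evaluation below uses a single fixed scale (800x1333) with no flip and
+  # pad_gt: false, in contrast to the multi-scale RandomResize used by TrainReader.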
sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/cascade_rcnn/_base_/cascade_mask_fpn_reader.yml b/configs/cascade_rcnn/_base_/cascade_mask_fpn_reader.yml new file mode 100644 index 0000000..cf54ecc --- /dev/null +++ b/configs/cascade_rcnn/_base_/cascade_mask_fpn_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml new file mode 100644 index 0000000..ea2937b --- /dev/null +++ b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml @@ -0,0 +1,97 @@ +architecture: CascadeRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + + +CascadeRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: CascadeHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + + +CascadeHead: + head: CascadeTwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + 
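+  # Note: cascade_iou below lists the positive-IoU threshold used by each of the
+  # three cascade refinement stages (0.5 -> 0.6 -> 0.7), following Cascade R-CNN.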
batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + cascade_iou: [0.5, 0.6, 0.7] + use_random: True + +CascadeTwoFCHead: + out_channel: 1024 + +BBoxPostProcess: + decode: + name: RCNNBox + prior_box_var: [30.0, 30.0, 15.0, 15.0] + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 + + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: False + +MaskFeat: + num_convs: 4 + out_channel: 256 + +MaskAssigner: + mask_resolution: 28 + +MaskPostProcess: + binary_thresh: 0.5 diff --git a/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml new file mode 100644 index 0000000..c5afe77 --- /dev/null +++ b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml @@ -0,0 +1,75 @@ +architecture: CascadeRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + + +CascadeRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: CascadeHead + # post process + bbox_post_process: BBoxPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + + +CascadeHead: + head: CascadeTwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + cascade_iou: [0.5, 0.6, 0.7] + use_random: True + +CascadeTwoFCHead: + out_channel: 1024 + +BBoxPostProcess: + decode: + name: RCNNBox + prior_box_var: [30.0, 30.0, 15.0, 15.0] + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/configs/cascade_rcnn/_base_/optimizer_1x.yml b/configs/cascade_rcnn/_base_/optimizer_1x.yml new file mode 100644 index 0000000..63f898e --- /dev/null +++ b/configs/cascade_rcnn/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.001 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.yml b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..b2c7e53 --- /dev/null +++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_mask_rcnn_r50_fpn.yml', + '_base_/cascade_mask_fpn_reader.yml', +] +weights: output/cascade_mask_rcnn_r50_fpn_1x_coco/model_final diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml b/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml new file mode 100644 index 0000000..0ab507c --- /dev/null +++ 
b/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml @@ -0,0 +1,18 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_mask_rcnn_r50_fpn.yml', + '_base_/cascade_mask_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/cascade_mask_rcnn_r50_vd_fpn_ssld_1x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] diff --git a/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml b/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml new file mode 100644 index 0000000..736ba2e --- /dev/null +++ b/configs/cascade_rcnn/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_mask_rcnn_r50_fpn.yml', + '_base_/cascade_mask_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/cascade_mask_rcnn_r50_vd_fpn_ssld_2x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [12, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..b2cc799 --- /dev/null +++ b/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_rcnn_r50_fpn.yml', + '_base_/cascade_fpn_reader.yml', +] +weights: output/cascade_rcnn_r50_fpn_1x_coco/model_final diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml b/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml new file mode 100644 index 0000000..905adbd --- /dev/null +++ b/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_1x_coco.yml @@ -0,0 +1,18 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_rcnn_r50_fpn.yml', + '_base_/cascade_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/cascade_rcnn_r50_vd_fpn_ssld_1x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] diff --git a/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml b/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml new file mode 100644 index 0000000..a627214 --- /dev/null +++ b/configs/cascade_rcnn/cascade_rcnn_r50_vd_fpn_ssld_2x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/cascade_rcnn_r50_fpn.yml', + '_base_/cascade_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/cascade_rcnn_r50_vd_fpn_ssld_2x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + 
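+  # Note: return_idx [0,1,2,3] feeds stages res2-res5 to the FPN; lr_mult_list
+  # below is understood to scale the backbone learning rate per stage so the
+  # SSLD-distilled pretrained weights are fine-tuned more gently.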
num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [12, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/datasets/coco_detection.yml b/configs/datasets/coco_detection.yml new file mode 100644 index 0000000..7a62c3b --- /dev/null +++ b/configs/datasets/coco_detection.yml @@ -0,0 +1,19 @@ +metric: COCO +num_classes: 80 + +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + +TestDataset: + !ImageFolder + anno_path: annotations/instances_val2017.json diff --git a/configs/datasets/coco_instance.yml b/configs/datasets/coco_instance.yml new file mode 100644 index 0000000..5eaf767 --- /dev/null +++ b/configs/datasets/coco_instance.yml @@ -0,0 +1,19 @@ +metric: COCO +num_classes: 80 + +TrainDataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_poly', 'is_crowd'] + +EvalDataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + +TestDataset: + !ImageFolder + anno_path: annotations/instances_val2017.json diff --git a/configs/datasets/dota.yml b/configs/datasets/dota.yml new file mode 100644 index 0000000..2953a79 --- /dev/null +++ b/configs/datasets/dota.yml @@ -0,0 +1,20 @@ +metric: COCO +num_classes: 15 + +TrainDataset: + !COCODataSet + image_dir: trainval_split/images + anno_path: trainval_split/s2anet_trainval_paddle_coco.json + dataset_dir: dataset/DOTA_1024_s2anet + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_rbox'] + +EvalDataset: + !COCODataSet + image_dir: trainval_split/images + anno_path: trainval_split/s2anet_trainval_paddle_coco.json + dataset_dir: dataset/DOTA_1024_s2anet/ + +TestDataset: + !ImageFolder + anno_path: trainval_split/s2anet_trainval_paddle_coco.json + dataset_dir: dataset/DOTA_1024_s2anet/ diff --git a/configs/datasets/roadsign_voc.yml b/configs/datasets/roadsign_voc.yml new file mode 100644 index 0000000..ddbfc78 --- /dev/null +++ b/configs/datasets/roadsign_voc.yml @@ -0,0 +1,21 @@ +metric: VOC +map_type: integral +num_classes: 4 + +TrainDataset: + !VOCDataSet + dataset_dir: dataset/roadsign_voc + anno_path: train.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + !VOCDataSet + dataset_dir: dataset/roadsign_voc + anno_path: valid.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + !ImageFolder + anno_path: dataset/roadsign_voc/label_list.txt diff --git a/configs/datasets/voc.yml b/configs/datasets/voc.yml new file mode 100644 index 0000000..9fb492f --- /dev/null +++ b/configs/datasets/voc.yml @@ -0,0 +1,21 @@ +metric: VOC +map_type: 11point +num_classes: 20 + +TrainDataset: + !VOCDataSet + dataset_dir: dataset/voc + anno_path: trainval.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + !VOCDataSet + dataset_dir: dataset/voc + anno_path: test.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + !ImageFolder + anno_path: 
dataset/voc/label_list.txt diff --git a/configs/datasets/wider_face.yml b/configs/datasets/wider_face.yml new file mode 100644 index 0000000..cc01378 --- /dev/null +++ b/configs/datasets/wider_face.yml @@ -0,0 +1,20 @@ +metric: WiderFace +num_classes: 1 + +TrainDataset: + !WIDERFaceDataSet + dataset_dir: dataset/wider_face + anno_path: wider_face_split/wider_face_train_bbx_gt.txt + image_dir: WIDER_train/images + data_fields: ['image', 'gt_bbox', 'gt_class'] + +EvalDataset: + !WIDERFaceDataSet + dataset_dir: dataset/wider_face + anno_path: wider_face_split/wider_face_val_bbx_gt.txt + image_dir: WIDER_val/images + data_fields: ['image'] + +TestDataset: + !ImageFolder + use_default_label: true diff --git a/configs/dcn/README.md b/configs/dcn/README.md new file mode 100644 index 0000000..9c8613f --- /dev/null +++ b/configs/dcn/README.md @@ -0,0 +1,37 @@ +### Deformable ConvNets v2 + +| 骨架网络 | 网络类型 | 卷积 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | Mask AP | 下载 | 配置文件 | +| :------------------- | :------------- | :-----: |:--------: | :-----: | :-----------: |:----: | :-----: | :----------------------------------------------------------: | :----: | +| ResNet50-FPN | Faster | c3-c5 | 1 | 1x | - | 42.1 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_dcn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/faster_rcnn_dcn_r50_fpn_1x_coco.yml) | +| ResNet50-vd-FPN | Faster | c3-c5 | 1 | 1x | - | 42.7 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_dcn_r50_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_1x_coco.yml) | +| ResNet50-vd-FPN | Faster | c3-c5 | 1 | 2x | - | 43.7 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_dcn_r50_vd_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x_coco.yml) | +| ResNet101-vd-FPN | Faster | c3-c5 | 1 | 1x | - | 45.1 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_dcn_r101_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Faster | c3-c5 | 1 | 1x | - | 46.5 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.pdparams) |[配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml) | +| ResNet50-FPN | Mask | c3-c5 | 1 | 1x | - | 42.7 | 38.4 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_dcn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/mask_rcnn_dcn_r50_fpn_1x_coco.yml) | +| ResNet50-vd-FPN | Mask | c3-c5 | 1 | 2x | - | 44.6 | 39.8 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_dcn_r50_vd_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x_coco.yml) | +| ResNet101-vd-FPN | Mask | c3-c5 | 1 | 1x | - | 45.6 | 40.6 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_dcn_r101_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Mask | c3-c5 | 1 | 1x | - | 47.3 | 42.0 | 
[下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml) | +| ResNet50-FPN | Cascade Faster | c3-c5 | 1 | 1x | - | 42.1 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_dcn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Cascade Faster | c3-c5 | 1 | 1x | - | 48.8 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml) | + + +**注意事项:** + +- Deformable卷积网络v2(dcn_v2)参考自论文[Deformable ConvNets v2](https://arxiv.org/abs/1811.11168). +- `c3-c5`意思是在resnet模块的3到5阶段增加`dcn`. + +## Citations +``` +@inproceedings{dai2017deformable, + title={Deformable Convolutional Networks}, + author={Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2017} +} +@article{zhu2018deformable, + title={Deformable ConvNets v2: More Deformable, Better Results}, + author={Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng}, + journal={arXiv preprint arXiv:1811.11168}, + year={2018} +} +``` diff --git a/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x_coco.yml b/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..9f2738f --- /dev/null +++ b/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x_coco.yml @@ -0,0 +1,16 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../cascade_rcnn/_base_/optimizer_1x.yml', + '../cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml', + '../cascade_rcnn/_base_/cascade_fpn_reader.yml', +] +weights: output/cascade_rcnn_dcn_r50_fpn_1x_coco/model_final + +ResNet: + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml b/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml new file mode 100644 index 0000000..4180919 --- /dev/null +++ b/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml @@ -0,0 +1,16 @@ +_BASE_: [ + 'cascade_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco/model_final + +ResNet: + depth: 101 + groups: 64 + base_width: 4 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x_coco.yml b/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..274c171 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'faster_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_pretrained.pdparams +weights: output/faster_rcnn_dcn_r101_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/faster_rcnn_dcn_r50_fpn_1x_coco.yml 
b/configs/dcn/faster_rcnn_dcn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..1cd02ac --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r50_fpn_1x_coco.yml @@ -0,0 +1,16 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../faster_rcnn/_base_/optimizer_1x.yml', + '../faster_rcnn/_base_/faster_rcnn_r50_fpn.yml', + '../faster_rcnn/_base_/faster_fpn_reader.yml', +] +weights: output/faster_rcnn_dcn_r50_fpn_1x_coco/model_final + +ResNet: + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_1x_coco.yml b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..735edbb --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_1x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'faster_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/faster_rcnn_dcn_r50_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x_coco.yml b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x_coco.yml new file mode 100644 index 0000000..685d967 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x_coco.yml @@ -0,0 +1,26 @@ +_BASE_: [ + 'faster_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/faster_rcnn_dcn_r50_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml b/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml new file mode 100644 index 0000000..68fef48 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + 'faster_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + groups: 64 + base_width: 4 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x_coco.yml b/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..930bd89 --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'mask_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_pretrained.pdparams +weights: output/mask_rcnn_dcn_r101_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/mask_rcnn_dcn_r50_fpn_1x_coco.yml b/configs/dcn/mask_rcnn_dcn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..b14a1ed --- /dev/null +++ 
b/configs/dcn/mask_rcnn_dcn_r50_fpn_1x_coco.yml @@ -0,0 +1,16 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '../mask_rcnn/_base_/optimizer_1x.yml', + '../mask_rcnn/_base_/mask_rcnn_r50_fpn.yml', + '../mask_rcnn/_base_/mask_fpn_reader.yml', +] +weights: output/mask_rcnn_dcn_r50_fpn_1x_coco/model_final + +ResNet: + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x_coco.yml b/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x_coco.yml new file mode 100644 index 0000000..d36b5f5 --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x_coco.yml @@ -0,0 +1,26 @@ +_BASE_: [ + 'mask_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/mask_rcnn_dcn_r50_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml b/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml new file mode 100644 index 0000000..8e7857c --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + 'mask_rcnn_dcn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + variant: d + groups: 64 + base_width: 4 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] diff --git a/configs/dota/README.md b/configs/dota/README.md new file mode 100644 index 0000000..3fe6bd8 --- /dev/null +++ b/configs/dota/README.md @@ -0,0 +1,125 @@ +# S2ANet模型 + +## 内容 +- [简介](#简介) +- [DOTA数据集](#DOTA数据集) +- [模型库](#模型库) +- [训练说明](#训练说明) + +## 简介 + +[S2ANet](https://arxiv.org/pdf/2008.09397.pdf)是用于检测旋转框的模型,要求使用PaddlePaddle 2.0.1(可使用pip安装) 或适当的[develop版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#whl-release)。 + + +## DOTA数据集 +[DOTA Dataset]是航空影像中物体检测的数据集,包含2806张图像,每张图像4000*4000分辨率。 + +| 数据版本 | 类别数 | 图像数 | 图像尺寸 | 实例数 | 标注方式 | +|:--------:|:-------:|:---------:|:---------:| :---------:| :------------: | +| v1.0 | 15 | 2806 | 800~4000 | 118282 | OBB + HBB | +| v1.5 | 16 | 2806 | 800~4000 | 400000 | OBB + HBB | + +注:OBB标注方式是指标注任意四边形;顶点按顺时针顺序排列。HBB标注方式是指标注示例的外接矩形。 + +DOTA数据集中总共有2806张图像,其中1411张图像作为训练集,458张图像作为评估集,剩余937张图像作为测试集。 + +如果需要切割图像数据,请参考[DOTA_devkit](https://github.com/CAPTAIN-WHU/DOTA_devkit) 。 + +设置`crop_size=1024, stride=824, gap=200`参数切割数据后,训练集15749张图像,评估集5297张图像,测试集10833张图像。 + +## 模型库 + +### S2ANet模型 + +| 模型 | GPU个数 | Conv类型 | mAP | 模型下载 | 配置文件 | +|:-----------:|:-------:|:----------:|:--------:| :----------:| :---------: | +| S2ANet | 8 | Conv | 71.42 | [model](https://paddledet.bj.bcebos.com/models/s2anet_conv_1x_dota.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/dota/s2anet_conv_1x_dota.yml) | + +**注意:**这里使用`multiclass_nms`,与原作者使用nms略有不同,精度相比原始论文中高0.15 (71.27-->71.42)。 + +## 训练说明 + +### 1. 
旋转框IOU计算OP
+
+旋转框IOU计算OP[ext_op](../../ppdet/ext_op)是参考Paddle[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/07_new_op/new_custom_op.html) 的方式开发。
+
+若使用旋转框IOU计算OP,需要环境满足:
+- PaddlePaddle >= 2.0.1
+- GCC == 8.2
+
+推荐使用docker镜像[paddle:2.0.1-gpu-cuda10.1-cudnn7](registry.baidubce.com/paddlepaddle/paddle:2.0.1-gpu-cuda10.1-cudnn7)。
+
+执行如下命令下载镜像并启动容器:
+```
+sudo nvidia-docker run -it --name paddle_s2anet -v $PWD:/paddle --network=host registry.baidubce.com/paddlepaddle/paddle:2.0.1-gpu-cuda10.1-cudnn7 /bin/bash
+```
+
+镜像中paddle2.0.1已安装好,进入python3.7,执行如下代码检查paddle安装是否正常:
+```
+import paddle
+print(paddle.__version__)
+paddle.utils.run_check()
+```
+
+进入到`ppdet/ext_op`文件夹,安装:
+```
+python3.7 setup.py install
+```
+
+Windows环境请按照如下步骤安装:
+
+(1)准备Visual Studio (版本需要>=Visual Studio 2015 update3),这里以VS2017为例;
+
+(2)点击开始-->Visual Studio 2017-->适用于 VS 2017 的x64本机工具命令提示;
+
+(3)设置环境变量:`set DISTUTILS_USE_SDK=1`
+
+(4)进入`PaddleDetection/ppdet/ext_op`目录,通过`python3.7 setup.py install`命令进行安装。
+
+安装完成后,测试自定义op是否可以正常编译以及计算结果:
+```
+cd PaddleDetection/ppdet/ext_op
+python3.7 test.py
+```
+
+### 2. 数据格式
+DOTA 数据集中实例是按照任意四边形标注,在进行训练模型前,需要参考[DOTA2COCO](https://github.com/CAPTAIN-WHU/DOTA_devkit/blob/master/DOTA2COCO.py) 转换成`[xc, yc, bow_w, bow_h, angle]`格式,并以coco数据格式存储。
+
+## 评估
+
+执行如下命令,会在`output_dir`文件夹下将每个图像的预测结果保存到与图像同名的txt文本中。
+```
+python3.7 tools/infer.py -c configs/dota/s2anet_1x_dota.yml -o weights=./weights/s2anet_1x_dota.pdparams --infer_dir=dota_test_images --draw_threshold=0.05 --save_txt=True --output_dir=output
+```
+
+
+请参考[DOTA_devkit](https://github.com/CAPTAIN-WHU/DOTA_devkit) 生成评估文件,评估文件格式请参考[DOTA Test](http://captain.whu.edu.cn/DOTAweb/tasks.html) ,生成zip文件,每个类一个txt文件,txt文件中每行格式为:`image_id score x1 y1 x2 y2 x3 y3 x4 y4`,提交服务器进行评估。
+
+## 预测部署
+
+Paddle中`multiclass_nms`算子支持四边形输入,因此部署时可以不需要依赖旋转框IOU计算算子。
+
+```bash
+# 预测
+CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/dota/s2anet_1x_dota.yml -o weights=model.pdparams --infer_img=demo/P0072__1.0__0___0.png
+```
+
+
+## Citations
+```
+@article{han2021align,
+  author={J. {Han} and J. {Ding} and J. {Li} and G. -S.
{Xia}}, + journal={IEEE Transactions on Geoscience and Remote Sensing}, + title={Align Deep Features for Oriented Object Detection}, + year={2021}, + pages={1-11}, + doi={10.1109/TGRS.2021.3062048}} + +@inproceedings{xia2018dota, + title={DOTA: A large-scale dataset for object detection in aerial images}, + author={Xia, Gui-Song and Bai, Xiang and Ding, Jian and Zhu, Zhen and Belongie, Serge and Luo, Jiebo and Datcu, Mihai and Pelillo, Marcello and Zhang, Liangpei}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={3974--3983}, + year={2018} +} +``` diff --git a/configs/dota/_base_/s2anet.yml b/configs/dota/_base_/s2anet.yml new file mode 100644 index 0000000..f4e4974 --- /dev/null +++ b/configs/dota/_base_/s2anet.yml @@ -0,0 +1,55 @@ +architecture: S2ANet +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams +weights: output/s2anet_r50_fpn_1x_dota/model_final.pdparams + + +# Model Achitecture +S2ANet: + backbone: ResNet + neck: FPN + s2anet_head: S2ANetHead + s2anet_bbox_post_process: S2ANetBBoxPostProcess + +ResNet: + depth: 50 + norm_type: bn + return_idx: [1,2,3] + num_stages: 4 + +FPN: + in_channels: [256, 512, 1024] + out_channel: 256 + spatial_scales: [0.25, 0.125, 0.0625] + has_extra_convs: True + extra_stage: 2 + relu_before_extra_convs: False + +S2ANetHead: + anchor_strides: [8, 16, 32, 64, 128] + anchor_scales: [4] + anchor_ratios: [1.0] + anchor_assign: RBoxAssigner + stacked_convs: 2 + feat_in: 256 + feat_out: 256 + num_classes: 15 + align_conv_type: 'Conv' # AlignConv Conv + align_conv_size: 3 + use_sigmoid_cls: True + +RBoxAssigner: + pos_iou_thr: 0.5 + neg_iou_thr: 0.4 + min_iou_thr: 0.0 + ignore_iof_thr: -2 + +S2ANetBBoxPostProcess: + nms_pre: 2000 + min_bbox_size: 0.0 + nms: + name: MultiClassNMS + keep_top_k: -1 + score_threshold: 0.05 + nms_threshold: 0.1 + normalized: False + #background_label: -1 diff --git a/configs/dota/_base_/s2anet_optimizer_1x.yml b/configs/dota/_base_/s2anet_optimizer_1x.yml new file mode 100644 index 0000000..65f794d --- /dev/null +++ b/configs/dota/_base_/s2anet_optimizer_1x.yml @@ -0,0 +1,20 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [7, 10] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + clip_grad_by_norm: 35 diff --git a/configs/dota/_base_/s2anet_reader.yml b/configs/dota/_base_/s2anet_reader.yml new file mode 100644 index 0000000..c3df7a0 --- /dev/null +++ b/configs/dota/_base_/s2anet_reader.yml @@ -0,0 +1,42 @@ +worker_num: 0 +TrainReader: + sample_transforms: + - Decode: {} + - Rbox2Poly: {} + # Resize can process rbox + - Resize: {target_size: [1024, 1024], interp: 2, keep_ratio: False} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - RboxPadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [1024, 1024], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - RboxPadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - 
Decode: {} + - Resize: {interp: 2, target_size: [1024, 1024], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - RboxPadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/dota/s2anet_1x_dota.yml b/configs/dota/s2anet_1x_dota.yml new file mode 100644 index 0000000..d480c1c --- /dev/null +++ b/configs/dota/s2anet_1x_dota.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/dota.yml', + '../runtime.yml', + '_base_/s2anet_optimizer_1x.yml', + '_base_/s2anet.yml', + '_base_/s2anet_reader.yml', +] +weights: output/s2anet_1x_dota/model_final diff --git a/configs/dota/s2anet_conv_1x_dota.yml b/configs/dota/s2anet_conv_1x_dota.yml new file mode 100644 index 0000000..60931b1 --- /dev/null +++ b/configs/dota/s2anet_conv_1x_dota.yml @@ -0,0 +1,21 @@ +_BASE_: [ + '../datasets/dota.yml', + '../runtime.yml', + '_base_/s2anet_optimizer_1x.yml', + '_base_/s2anet.yml', + '_base_/s2anet_reader.yml', +] +weights: output/s2anet_1x_dota/model_final + +S2ANetHead: + anchor_strides: [8, 16, 32, 64, 128] + anchor_scales: [4] + anchor_ratios: [1.0] + anchor_assign: RBoxAssigner + stacked_convs: 2 + feat_in: 256 + feat_out: 256 + num_classes: 15 + align_conv_type: 'Conv' # AlignConv Conv + align_conv_size: 3 + use_sigmoid_cls: True diff --git a/configs/face_detection/README.md b/configs/face_detection/README.md new file mode 100644 index 0000000..3f0fe24 --- /dev/null +++ b/configs/face_detection/README.md @@ -0,0 +1,106 @@ +# 人脸检测模型 + +## 简介 +`face_detection`中提供高效、高速的人脸检测解决方案,包括最先进的模型和经典模型。 + +![](../../docs/images/12_Group_Group_12_Group_Group_12_935.jpg) + +## 模型库 + +#### WIDER-FACE数据集上的mAP + +| 网络结构 | 输入尺寸 | 图片个数/GPU | 学习率策略 | Easy/Medium/Hard Set | 预测时延(SD855)| 模型大小(MB) | 下载 | 配置文件 | +|:------------:|:--------:|:----:|:-------:|:-------:|:---------:|:----------:|:---------:|:--------:| +| BlazeFace | 640 | 8 | 1000e | 0.885 / 0.855 / 0.731 | - | 0.472 |[下载链接](https://paddledet.bj.bcebos.com/models/blazeface_1000e.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/face_detection/blazeface_1000e.yml) | + +**注意:** +- 我们使用多尺度评估策略得到`Easy/Medium/Hard Set`里的mAP。具体细节请参考[在WIDER-FACE数据集上评估](#在WIDER-FACE数据集上评估)。 + +## 快速开始 + +### 数据准备 +我们使用[WIDER-FACE数据集](http://shuoyang1213.me/WIDERFACE/)进行训练和模型测试,官方网站提供了详细的数据介绍。 +- WIDER-Face数据源: +使用如下目录结构加载`wider_face`类型的数据集: + + ``` + dataset/wider_face/ + ├── wider_face_split + │ ├── wider_face_train_bbx_gt.txt + │ ├── wider_face_val_bbx_gt.txt + ├── WIDER_train + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_100.jpg + │ │ │ ├── 0_Parade_marchingband_1_381.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ├── WIDER_val + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_1004.jpg + │ │ │ ├── 0_Parade_marchingband_1_1045.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... 
+  ```
+
+- 手动下载数据集:
+要下载WIDER-FACE数据集,请运行以下命令:
+```
+cd dataset/wider_face && ./download_wider_face.sh
+```
+
+### 训练与评估
+训练流程与评估流程方法与其他算法一致,请参考[GETTING_STARTED_cn.md](../../docs/tutorials/GETTING_STARTED_cn.md)。
+**注意:** 人脸检测模型目前不支持边训练边评估。
+
+#### 在WIDER-FACE数据集上评估
+- 步骤一:评估并生成结果文件:
+```shell
+python -u tools/eval.py -c configs/face_detection/blazeface_1000e.yml \
+       -o weights=output/blazeface_1000e/model_final \
+       multi_scale=True
+```
+设置`multi_scale=True`进行多尺度评估,评估完成后,将在`output/pred`中生成txt格式的测试结果。
+
+- 步骤二:下载官方评估脚本和Ground Truth文件:
+```
+wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/eval_script/eval_tools.zip
+unzip eval_tools.zip && rm -f eval_tools.zip
+```
+
+- 步骤三:开始评估
+
+方法一:python评估:
+```
+git clone https://github.com/wondervictor/WiderFace-Evaluation.git
+cd WiderFace-Evaluation
+# 编译
+python3 setup.py build_ext --inplace
+# 开始评估
+python3 evaluation.py -p /path/to/PaddleDetection/output/pred -g /path/to/eval_tools/ground_truth
+```
+
+方法二:MatLab评估:
+```
+# 在`eval_tools/wider_eval.m`中修改保存结果路径和绘制曲线的名称:
+pred_dir = './pred';
+legend_name = 'Paddle-BlazeFace';
+
+`wider_eval.m` 是评估模块的主要执行程序。运行命令如下:
+matlab -nodesktop -nosplash -nojvm -r "run wider_eval.m;quit;"
+```
+
+
+## Citations
+
+```
+@article{bazarevsky2019blazeface,
+  title={BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs},
+  author={Valentin Bazarevsky and Yury Kartynnik and Andrey Vakunov and Karthik Raveendran and Matthias Grundmann},
+  year={2019},
+  eprint={1907.05047},
+  archivePrefix={arXiv},
+}
+```
diff --git a/configs/face_detection/_base_/blazeface.yml b/configs/face_detection/_base_/blazeface.yml
new file mode 100644
index 0000000..469aa9c
--- /dev/null
+++ b/configs/face_detection/_base_/blazeface.yml
@@ -0,0 +1,39 @@
+architecture: SSD
+
+SSD:
+  backbone: BlazeNet
+  ssd_head: FaceHead
+  post_process: BBoxPostProcess
+
+BlazeNet:
+  blaze_filters: [[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]]
+  double_blaze_filters: [[48, 24, 96, 2], [96, 24, 96], [96, 24, 96],
+                         [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]]
+
+FaceHead:
+  in_channels: [96, 96]
+  anchor_generator: AnchorGeneratorSSD
+  loss: SSDLoss
+
+SSDLoss:
+  overlap_threshold: 0.35
+
+AnchorGeneratorSSD:
+  steps: [8., 16.]
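+  # Note: the two entries in steps and min_sizes correspond to the two BlazeNet
+  # output feature maps consumed by FaceHead (strides 8 and 16); the first map
+  # gets 2 anchor sizes and the second gets 6.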
+ aspect_ratios: [[1.], [1.]] + min_sizes: [[16.,24.], [32., 48., 64., 80., 96., 128.]] + max_sizes: [[], []] + offset: 0.5 + flip: False + min_max_aspect_ratios_order: false + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 750 + score_threshold: 0.01 + nms_threshold: 0.3 + nms_top_k: 5000 + nms_eta: 1.0 diff --git a/configs/face_detection/_base_/face_reader.yml b/configs/face_detection/_base_/face_reader.yml new file mode 100644 index 0000000..7b31b49 --- /dev/null +++ b/configs/face_detection/_base_/face_reader.yml @@ -0,0 +1,45 @@ +worker_num: 2 +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - RandomDistort: {brightness: [0.5, 1.125, 0.875], random_apply: False} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomFlip: {} + - CropWithDataAchorSampling: { + anchor_sampler: [[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]], + batch_sampler: [ + [1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + ], + target_size: 640} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 1} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + batch_transforms: + - NormalizeImage: {mean: [123, 117, 104], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 8 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {mean: [123, 117, 104], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 1 + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {mean: [123, 117, 104], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 1 diff --git a/configs/face_detection/_base_/optimizer_1000e.yml b/configs/face_detection/_base_/optimizer_1000e.yml new file mode 100644 index 0000000..d67da4c --- /dev/null +++ b/configs/face_detection/_base_/optimizer_1000e.yml @@ -0,0 +1,21 @@ +epoch: 1000 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 333 + - 800 + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSProp + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/face_detection/blazeface_1000e.yml b/configs/face_detection/blazeface_1000e.yml new file mode 100644 index 0000000..58fc908 --- /dev/null +++ b/configs/face_detection/blazeface_1000e.yml @@ -0,0 +1,9 @@ +_BASE_: [ + '../datasets/wider_face.yml', + '../runtime.yml', + '_base_/optimizer_1000e.yml', + '_base_/blazeface.yml', + '_base_/face_reader.yml', +] +weights: output/blazeface_1000e/model_final +multi_scale_eval: True diff --git a/configs/faster_rcnn/README.md b/configs/faster_rcnn/README.md new file mode 100644 index 0000000..a7e08ab --- /dev/null +++ b/configs/faster_rcnn/README.md @@ -0,0 +1,35 @@ +# Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks + +## Model Zoo + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :------------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50 | Faster | 1 | 1x | ---- | 36.7 | 
[下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml) | +| ResNet50-vd | Faster | 1 | 1x | ---- | 37.6 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_vd_1x_coco.yml) | +| ResNet101 | Faster | 1 | 1x | ---- | 39.0 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r101_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r101_1x_coco.yml) | +| ResNet34-FPN | Faster | 1 | 1x | ---- | 37.8 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r34_fpn_1x_coco.yml) | +| ResNet34-vd-FPN | Faster | 1 | 1x | ---- | 38.5 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r34_vd_fpn_1x_coco.yml) | +| ResNet50-FPN | Faster | 1 | 1x | ---- | 38.4 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml) | +| ResNet50-FPN | Faster | 1 | 2x | ---- | 40.0 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.yml) | +| ResNet50-vd-FPN | Faster | 1 | 1x | ---- | 39.5 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_1x_coco.yml) | +| ResNet50-vd-FPN | Faster | 1 | 2x | ---- | 40.8 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_2x_coco.yml) | +| ResNet101-FPN | Faster | 1 | 2x | ---- | 41.4 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r101_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.yml) | +| ResNet101-vd-FPN | Faster | 1 | 1x | ---- | 42.0 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r101_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_1x_coco.yml) | +| ResNet101-vd-FPN | Faster | 1 | 2x | ---- | 43.0 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r101_vd_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_2x_coco.yml) | +| ResNeXt101-vd-FPN | Faster | 1 | 1x | ---- | 43.4 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Faster | 1 | 2x | ---- | 44.0 | 
[下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_x101_vd_64x4d_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 1x | ---- | 41.4 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Faster | 1 | 2x | ---- | 42.3 | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_vd_ssld_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/faster_rcnn/faster_rcnn_r50_vd_ssld_fpn_2x_coco.yml) | + + +## Citations +``` +@article{Ren_2017, + title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + year={2017}, + month={Jun}, +} +``` diff --git a/configs/faster_rcnn/_base_/faster_fpn_reader.yml b/configs/faster_rcnn/_base_/faster_fpn_reader.yml new file mode 100644 index 0000000..cf54ecc --- /dev/null +++ b/configs/faster_rcnn/_base_/faster_fpn_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/faster_rcnn/_base_/faster_rcnn_r50.yml b/configs/faster_rcnn/_base_/faster_rcnn_r50.yml new file mode 100644 index 0000000..fd29f5e --- /dev/null +++ b/configs/faster_rcnn/_base_/faster_rcnn_r50.yml @@ -0,0 +1,66 @@ +architecture: FasterRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +FasterRCNN: + backbone: ResNet + rpn_head: RPNHead + bbox_head: BBoxHead + # post process + bbox_post_process: BBoxPostProcess + + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [2] + num_stages: 3 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [32, 64, 128, 256, 512] + strides: [16] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + 
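+    # Note: during training the RPN keeps the top-scoring 12000 proposals before
+    # NMS and 2000 after it; this C4 (no-FPN) model uses larger proposal budgets
+    # than the FPN variants.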
pre_nms_top_n: 12000 + post_nms_top_n: 2000 + topk_after_collect: False + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + + +BBoxHead: + head: Res5Head + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + with_pool: true + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml new file mode 100644 index 0000000..38ee81d --- /dev/null +++ b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml @@ -0,0 +1,73 @@ +architecture: FasterRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +FasterRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + # post process + bbox_post_process: BBoxPostProcess + + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 1000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + + +BBoxHead: + head: TwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +TwoFCHead: + out_channel: 1024 + + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/configs/faster_rcnn/_base_/faster_reader.yml b/configs/faster_rcnn/_base_/faster_reader.yml new file mode 100644 index 0000000..ebb4e79 --- /dev/null +++ b/configs/faster_rcnn/_base_/faster_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1} + batch_size: 1 + shuffle: false + drop_last: false diff --git 
a/configs/faster_rcnn/_base_/optimizer_1x.yml b/configs/faster_rcnn/_base_/optimizer_1x.yml new file mode 100644 index 0000000..4caaa63 --- /dev/null +++ b/configs/faster_rcnn/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/faster_rcnn/faster_rcnn_r101_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r101_1x_coco.yml new file mode 100644 index 0000000..8876426 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r101_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_pretrained.pdparams +weights: output/faster_rcnn_r101_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + norm_type: bn + freeze_at: 0 + return_idx: [2] + num_stages: 3 diff --git a/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.yml new file mode 100644 index 0000000..a2e5ee5 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r101_fpn_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_pretrained.pdparams +weights: output/faster_rcnn_r101_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.yml new file mode 100644 index 0000000..0a07dec --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r101_fpn_2x_coco.yml @@ -0,0 +1,25 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_pretrained.pdparams +weights: output/faster_rcnn_r101_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..32e308b --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_pretrained.pdparams +weights: output/faster_rcnn_r101_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_2x_coco.yml new file mode 100644 index 0000000..65b8226 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r101_vd_fpn_2x_coco.yml @@ -0,0 +1,25 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_pretrained.pdparams +weights: output/faster_rcnn_r101_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + 
norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_r34_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r34_fpn_1x_coco.yml new file mode 100644 index 0000000..f108352 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r34_fpn_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet34_pretrained.pdparams +weights: output/faster_rcnn_r34_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 34 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_r34_vd_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r34_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..5cf576b --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r34_vd_fpn_1x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet34_vd_pretrained.pdparams +weights: output/faster_rcnn_r34_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 34 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml new file mode 100644 index 0000000..a49bde8 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/faster_rcnn_r50.yml', + '_base_/faster_reader.yml', +] +weights: output/faster_rcnn_r50_1x_coco/model_final diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..e7b4518 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/faster_rcnn_r50_fpn.yml', + '_base_/faster_fpn_reader.yml', +] +weights: output/faster_rcnn_r50_fpn_1x_coco/model_final diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.yml new file mode 100644 index 0000000..7edaadc --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_2x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] +weights: output/faster_rcnn_r50_fpn_2x_coco/model_final + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_r50_vd_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_vd_1x_coco.yml new file mode 100644 index 0000000..ac0e720 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_vd_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/faster_rcnn_r50_vd_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [2] + num_stages: 3 diff --git a/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_1x_coco.yml 
b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..6bf9d71 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/faster_rcnn_r50_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_2x_coco.yml new file mode 100644 index 0000000..7fc3a88 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_2x_coco.yml @@ -0,0 +1,25 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/faster_rcnn_r50_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml new file mode 100644 index 0000000..d71b82d --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_1x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/faster_rcnn_r50_fpn.yml', + '_base_/faster_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/faster_rcnn_r50_vd_fpn_ssld_1x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 12 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_2x_coco.yml new file mode 100644 index 0000000..0562354 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_r50_vd_fpn_ssld_2x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/faster_rcnn_r50_fpn.yml', + '_base_/faster_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/faster_rcnn_r50_vd_fpn_ssld_2x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [12, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.yml b/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.yml new file mode 100644 index 0000000..317d374 --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_1x_coco.yml @@ -0,0 +1,17 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: 
https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/faster_rcnn_x101_vd_64x4d_fpn_1x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + groups: 64 + base_width: 4 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_2x_coco.yml b/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_2x_coco.yml new file mode 100644 index 0000000..939878f --- /dev/null +++ b/configs/faster_rcnn/faster_rcnn_x101_vd_64x4d_fpn_2x_coco.yml @@ -0,0 +1,28 @@ +_BASE_: [ + 'faster_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/faster_rcnn_x101_vd_64x4d_fpn_2x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + groups: 64 + base_width: 4 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/fcos/README.md b/configs/fcos/README.md new file mode 100644 index 0000000..27362f2 --- /dev/null +++ b/configs/fcos/README.md @@ -0,0 +1,31 @@ +# FCOS for Object Detection + +## Introduction + +FCOS (Fully Convolutional One-Stage Object Detection) is a fast anchor-free object detection framework with strong performance. We reproduced the model from the paper and further improved and optimized its accuracy. + +**Highlights:** + +- Training Time: Training the `fcos_r50_fpn_1x` model on 8 Tesla V100 GPUs takes only 8.5 hours. + +## Model Zoo + +| Backbone | Model | images/GPU | lr schedule |FPS | Box AP | download | config | +| :-------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50-FPN | FCOS | 2 | 1x | ---- | 39.6 | [download](https://paddledet.bj.bcebos.com/models/fcos_r50_fpn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/fcos/fcos_r50_fpn_1x_coco.yml) | +| ResNet50-FPN | FCOS+DCN | 2 | 1x | ---- | 44.3 | [download](https://paddledet.bj.bcebos.com/models/fcos_dcn_r50_fpn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/fcos/fcos_dcn_r50_fpn_1x_coco.yml) | +| ResNet50-FPN | FCOS+multiscale_train | 2 | 2x | ---- | 41.8 | [download](https://paddledet.bj.bcebos.com/models/fcos_r50_fpn_multiscale_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/fcos/fcos_r50_fpn_multiscale_2x_coco.yml) | + +**Notes:** + +- FCOS is trained on the COCO train2017 dataset and evaluated on val2017; results are reported as `mAP(IoU=0.5:0.95)`. + +## Citations +``` +@inproceedings{tian2019fcos, + title = {{FCOS}: Fully Convolutional One-Stage Object Detection}, + author = {Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, + booktitle = {Proc. Int. Conf.
Computer Vision (ICCV)}, + year = {2019} +} +``` diff --git a/configs/fcos/_base_/fcos_r50_fpn.yml b/configs/fcos/_base_/fcos_r50_fpn.yml new file mode 100644 index 0000000..64a275d --- /dev/null +++ b/configs/fcos/_base_/fcos_r50_fpn.yml @@ -0,0 +1,55 @@ +architecture: FCOS +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +FCOS: + backbone: ResNet + neck: FPN + fcos_head: FCOSHead + fcos_post_process: FCOSPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + spatial_scales: [0.125, 0.0625, 0.03125] + extra_stage: 2 + has_extra_convs: true + use_c5: false + +FCOSHead: + fcos_feat: + name: FCOSFeat + feat_in: 256 + feat_out: 256 + num_convs: 4 + norm_type: "gn" + use_dcn: false + num_classes: 80 + fpn_stride: [8, 16, 32, 64, 128] + prior_prob: 0.01 + fcos_loss: FCOSLoss + norm_reg_targets: true + centerness_on_reg: true + +FCOSLoss: + loss_alpha: 0.25 + loss_gamma: 2.0 + iou_loss_type: "giou" + reg_weights: 1.0 + +FCOSPostProcess: + decode: + name: FCOSBox + num_classes: 80 + nms: + name: MultiClassNMS + nms_top_k: 1000 + keep_top_k: 100 + score_threshold: 0.025 + nms_threshold: 0.6 diff --git a/configs/fcos/_base_/fcos_reader.yml b/configs/fcos/_base_/fcos_reader.yml new file mode 100644 index 0000000..4aa343a --- /dev/null +++ b/configs/fcos/_base_/fcos_reader.yml @@ -0,0 +1,42 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Resize: {target_size: [800, 1333], keep_ratio: true, interp: 1} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 128} + - Gt2FCOSTarget: + object_sizes_boundary: [64, 128, 256, 512] + center_sampling_radius: 1.5 + downsample_ratios: [8, 16, 32, 64, 128] + norm_reg_targets: True + batch_size: 2 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 128} + batch_size: 1 + shuffle: false + + +TestReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 128} + batch_size: 1 + shuffle: false diff --git a/configs/fcos/_base_/optimizer_1x.yml b/configs/fcos/_base_/optimizer_1x.yml new file mode 100644 index 0000000..d28b094 --- /dev/null +++ b/configs/fcos/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/fcos/fcos_dcn_r50_fpn_1x_coco.yml b/configs/fcos/fcos_dcn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..fe45428 --- /dev/null +++ b/configs/fcos/fcos_dcn_r50_fpn_1x_coco.yml @@ -0,0 +1,32 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/fcos_r50_fpn.yml', + '_base_/optimizer_1x.yml', + '_base_/fcos_reader.yml', +] + +weights: 
output/fcos_dcn_r50_fpn_1x_coco/model_final + +ResNet: + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] + +FCOSHead: + fcos_feat: + name: FCOSFeat + feat_in: 256 + feat_out: 256 + num_convs: 4 + norm_type: "gn" + use_dcn: true + num_classes: 80 + fpn_stride: [8, 16, 32, 64, 128] + prior_prob: 0.01 + fcos_loss: FCOSLoss + norm_reg_targets: true + centerness_on_reg: true diff --git a/configs/fcos/fcos_r50_fpn_1x_coco.yml b/configs/fcos/fcos_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..0b47d45 --- /dev/null +++ b/configs/fcos/fcos_r50_fpn_1x_coco.yml @@ -0,0 +1,9 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/fcos_r50_fpn.yml', + '_base_/optimizer_1x.yml', + '_base_/fcos_reader.yml', +] + +weights: output/fcos_r50_fpn_1x_coco/model_final diff --git a/configs/fcos/fcos_r50_fpn_multiscale_2x_coco.yml b/configs/fcos/fcos_r50_fpn_multiscale_2x_coco.yml new file mode 100644 index 0000000..291f8d8 --- /dev/null +++ b/configs/fcos/fcos_r50_fpn_multiscale_2x_coco.yml @@ -0,0 +1,39 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/fcos_r50_fpn.yml', + '_base_/optimizer_1x.yml', + '_base_/fcos_reader.yml', +] + +weights: output/fcos_r50_fpn_multiscale_2x_coco/model_final + +TrainReader: + sample_transforms: + - Decode: {} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], keep_ratio: true, interp: 1} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 128} + - Gt2FCOSTarget: + object_sizes_boundary: [64, 128, 256, 512] + center_sampling_radius: 1.5 + downsample_ratios: [8, 16, 32, 64, 128] + norm_reg_targets: True + batch_size: 2 + shuffle: true + drop_last: true + +epoch: 24 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 diff --git a/configs/gn/README.md b/configs/gn/README.md new file mode 100644 index 0000000..cc398af --- /dev/null +++ b/configs/gn/README.md @@ -0,0 +1,23 @@ +# Group Normalization + +## Model Zoo + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | Mask AP | 下载 | 配置文件 | +| :------------- | :------------- | :-----------: | :------: | :--------: |:-----: | :-----: | :----: | :----: | +| ResNet50-FPN | Faster | 1 | 2x | - | 41.9 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_r50_fpn_gn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml) | +| ResNet50-FPN | Mask | 1 | 2x | - | 42.3 | 38.4 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_fpn_gn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml) | +| ResNet50-FPN | Cascade Faster | 1 | 2x | - | 44.6 | - | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_rcnn_r50_fpn_gn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml) | +| ResNet50-FPN | Cascade Mask | 1 | 2x | - | 45.0 | 39.3 | [下载链接](https://paddledet.bj.bcebos.com/models/cascade_mask_rcnn_r50_fpn_gn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml) | + + 
+**注意:** Faster R-CNN baseline仅使用 `2fc` head,而此处使用[`4conv1fc` head](https://arxiv.org/abs/1803.08494)(4层conv之间使用GN),并且FPN也使用GN,而对于Mask R-CNN是在mask head的4层conv之间也使用GN。 + +## Citations +``` +@inproceedings{wu2018group, + title={Group Normalization}, + author={Wu, Yuxin and He, Kaiming}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2018} +} +``` diff --git a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml new file mode 100644 index 0000000..e2c750d --- /dev/null +++ b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml @@ -0,0 +1,61 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '../cascade_rcnn/_base_/optimizer_1x.yml', + '../cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml', + '../cascade_rcnn/_base_/cascade_mask_fpn_reader.yml', +] +weights: output/cascade_mask_rcnn_r50_fpn_gn_2x_coco/model_final + +CascadeRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: CascadeHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +FPN: + out_channel: 256 + norm_type: gn + +CascadeHead: + head: CascadeXConvNormHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +CascadeXConvNormHead: + num_convs: 4 + out_channel: 1024 + norm_type: gn + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: False + +MaskFeat: + num_convs: 4 + out_channel: 256 + norm_type: gn + + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml new file mode 100644 index 0000000..2706790 --- /dev/null +++ b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml @@ -0,0 +1,37 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../cascade_rcnn/_base_/optimizer_1x.yml', + '../cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml', + '../cascade_rcnn/_base_/cascade_fpn_reader.yml', +] +weights: output/cascade_rcnn_r50_fpn_gn_2x_coco/model_final + +FPN: + out_channel: 256 + norm_type: gn + +CascadeHead: + head: CascadeXConvNormHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +CascadeXConvNormHead: + num_convs: 4 + out_channel: 1024 + norm_type: gn + + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml new file mode 100644 index 0000000..200a98b --- /dev/null +++ b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml @@ -0,0 +1,45 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../faster_rcnn/_base_/optimizer_1x.yml', + '../faster_rcnn/_base_/faster_rcnn_r50_fpn.yml', + '../faster_rcnn/_base_/faster_fpn_reader.yml', +] +weights: output/faster_rcnn_r50_fpn_gn_2x_coco/model_final + +FasterRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + # post process + bbox_post_process: BBoxPostProcess + +FPN: + out_channel: 256 + norm_type: gn + +BBoxHead: + head: XConvNormHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: 
BBoxAssigner + +XConvNormHead: + num_convs: 4 + out_channel: 1024 + norm_type: gn + + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml new file mode 100644 index 0000000..70beaf5 --- /dev/null +++ b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml @@ -0,0 +1,61 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '../mask_rcnn/_base_/optimizer_1x.yml', + '../mask_rcnn/_base_/mask_rcnn_r50_fpn.yml', + '../mask_rcnn/_base_/mask_fpn_reader.yml', +] +weights: output/mask_rcnn_r50_fpn_gn_2x_coco/model_final + +MaskRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +FPN: + out_channel: 256 + norm_type: gn + +BBoxHead: + head: XConvNormHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +XConvNormHead: + num_convs: 4 + out_channel: 1024 + norm_type: gn + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: False + +MaskFeat: + num_convs: 4 + out_channel: 256 + norm_type: gn + + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md new file mode 100644 index 0000000..9f581ab --- /dev/null +++ b/configs/hrnet/README.md @@ -0,0 +1,34 @@ +# High-resolution networks (HRNets) for object detection + +## Introduction + +- Deep High-Resolution Representation Learning for Human Pose Estimation: [https://arxiv.org/abs/1902.09212](https://arxiv.org/abs/1902.09212) + +``` +@inproceedings{SunXLW19, + title={Deep High-Resolution Representation Learning for Human Pose Estimation}, + author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, + booktitle={CVPR}, + year={2019} +} +``` + +- High-Resolution Representations for Labeling Pixels and Regions: [https://arxiv.org/abs/1904.04514](https://arxiv.org/abs/1904.04514) + +``` +@article{SunZJCXLMWLW19, + title={High-Resolution Representations for Labeling Pixels and Regions}, + author={Ke Sun and Yang Zhao and Borui Jiang and Tianheng Cheng and Bin Xiao + and Dong Liu and Yadong Mu and Xinggang Wang and Wenyu Liu and Jingdong Wang}, + journal = {CoRR}, + volume = {abs/1904.04514}, + year={2019} +} +``` + +## Model Zoo + +| Backbone | Type | Image/gpu | Lr schd | Inf time (fps) | Box AP | Mask AP | Download | Configs | +| :---------------------- | :------------- | :-------: | :-----: | :------------: | :----: | :-----: | :----------------------------------------------------------: | :-----: | +| HRNetV2p_W18 | Faster | 1 | 1x | - | 36.8 | - | [model](https://paddledet.bj.bcebos.com/models/faster_rcnn_hrnetv2p_w18_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.yml) | +| HRNetV2p_W18 | Faster | 1 | 2x | - | 39.0 | - | [model](https://paddledet.bj.bcebos.com/models/faster_rcnn_hrnetv2p_w18_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.yml) | diff --git 
a/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml new file mode 100644 index 0000000..6c556f3 --- /dev/null +++ b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml @@ -0,0 +1,68 @@ +architecture: FasterRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/HRNet_W18_C_pretrained.pdparams + +FasterRCNN: + backbone: HRNet + neck: HRFPN + rpn_head: RPNHead + bbox_head: BBoxHead + # post process + bbox_post_process: BBoxPostProcess + +HRNet: + width: 18 + freeze_at: 0 + return_idx: [0, 1, 2, 3] + +HRFPN: + out_channel: 256 + share_conv: false + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +BBoxHead: + head: TwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +TwoFCHead: + out_channel: 1024 + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.yml b/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.yml new file mode 100644 index 0000000..6ff0596 --- /dev/null +++ b/configs/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco.yml @@ -0,0 +1,23 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + './_base_/faster_rcnn_hrnetv2p_w18.yml', + '../faster_rcnn/_base_/optimizer_1x.yml', + '../faster_rcnn/_base_/faster_fpn_reader.yml', + '../runtime.yml', +] + +weights: output/faster_rcnn_hrnetv2p_w18_1x_coco/model_final +epoch: 12 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +TrainReader: + batch_size: 2 diff --git a/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.yml b/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.yml new file mode 100644 index 0000000..73d9dc8 --- /dev/null +++ b/configs/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco.yml @@ -0,0 +1,23 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + './_base_/faster_rcnn_hrnetv2p_w18.yml', + '../faster_rcnn/_base_/optimizer_1x.yml', + '../faster_rcnn/_base_/faster_fpn_reader.yml', + '../runtime.yml', +] + +weights: output/faster_rcnn_hrnetv2p_w18_2x_coco/model_final +epoch: 24 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +TrainReader: + batch_size: 2 diff --git a/configs/mask_rcnn/README.md b/configs/mask_rcnn/README.md new file mode 100644 index 0000000..89f7f8e --- /dev/null +++ b/configs/mask_rcnn/README.md @@ -0,0 +1,31 @@ +# Mask R-CNN + +## Model Zoo + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | Mask AP | 下载 | 配置文件 | +| :------------------- | :------------| :-----: | :-----: | :------------: | :-----: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50 | Mask | 1 | 1x | ---- | 37.4 | 32.8 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_1x_coco.pdparams) | 
[配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_1x_coco.yml) | +| ResNet50 | Mask | 1 | 2x | ---- | 39.7 | 34.5 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_2x_coco.yml) | +| ResNet50-FPN | Mask | 1 | 1x | ---- | 39.2 | 35.6 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml) | +| ResNet50-FPN | Mask | 1 | 2x | ---- | 40.5 | 36.7 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.yml) | +| ResNet50-vd-FPN | Mask | 1 | 1x | ---- | 40.3 | 36.4 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_1x_coco.yml) | +| ResNet50-vd-FPN | Mask | 1 | 2x | ---- | 41.4 | 37.5 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_2x_coco.yml) | +| ResNet101-FPN | Mask | 1 | 1x | ---- | 40.6 | 36.6 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r101_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.yml) | +| ResNet101-vd-FPN | Mask | 1 | 1x | ---- | 42.4 | 38.1 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r101_vd_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r101_vd_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Mask | 1 | 1x | ---- | 44.0 | 39.5 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.yml) | +| ResNeXt101-vd-FPN | Mask | 1 | 2x | ---- | 44.6 | 39.8 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_x101_vd_64x4d_fpn_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_2x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 1x | ---- | 42.0 | 38.2 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml) | +| ResNet50-vd-SSLDv2-FPN | Mask | 1 | 2x | ---- | 42.7 | 38.9 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_vd_fpn_ssld_2x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml) | + + +## Citations +``` +@article{He_2017, + title={Mask R-CNN}, + journal={2017 IEEE International Conference on Computer Vision (ICCV)}, + publisher={IEEE}, + author={He, Kaiming and Gkioxari, Georgia and Dollar, Piotr and Girshick, Ross}, + year={2017}, + month={Oct} +} +``` diff --git a/configs/mask_rcnn/_base_/mask_fpn_reader.yml b/configs/mask_rcnn/_base_/mask_fpn_reader.yml new file mode 100644 index 0000000..d2cb8ec --- 
/dev/null +++ b/configs/mask_rcnn/_base_/mask_fpn_reader.yml @@ -0,0 +1,39 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml new file mode 100644 index 0000000..04dab63 --- /dev/null +++ b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml @@ -0,0 +1,87 @@ +architecture: MaskRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +MaskRCNN: + backbone: ResNet + rpn_head: RPNHead + bbox_head: BBoxHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [2] + num_stages: 3 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [32, 64, 128, 256, 512] + strides: [16] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + topk_after_collect: False + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + + +BBoxHead: + head: Res5Head + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + with_pool: true + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: true + +MaskFeat: + num_convs: 0 + out_channel: 256 + +MaskAssigner: + mask_resolution: 14 + +MaskPostProcess: + binary_thresh: 0.5 diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml new file mode 100644 index 0000000..dd75876 --- /dev/null +++ b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml @@ -0,0 +1,91 @@ +architecture: MaskRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +MaskRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + mask_head: MaskHead + # post process + 
bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 1000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +BBoxHead: + head: TwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +TwoFCHead: + out_channel: 1024 + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: False + +MaskFeat: + num_convs: 4 + out_channel: 256 + +MaskAssigner: + mask_resolution: 28 + +MaskPostProcess: + binary_thresh: 0.5 diff --git a/configs/mask_rcnn/_base_/mask_reader.yml b/configs/mask_rcnn/_base_/mask_reader.yml new file mode 100644 index 0000000..b43d312 --- /dev/null +++ b/configs/mask_rcnn/_base_/mask_reader.yml @@ -0,0 +1,41 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1, pad_gt: true} + batch_size: 1 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: -1} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false diff --git a/configs/mask_rcnn/_base_/optimizer_1x.yml b/configs/mask_rcnn/_base_/optimizer_1x.yml new file mode 100644 index 0000000..63f898e --- /dev/null +++ b/configs/mask_rcnn/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.001 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.yml new file mode 100644 index 0000000..aae703c --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.yml @@ -0,0 +1,13 @@ +_BASE_: [ + 
'mask_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_pretrained.pdparams +weights: output/mask_rcnn_r101_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/mask_rcnn/mask_rcnn_r101_vd_fpn_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r101_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..58d7a78 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r101_vd_fpn_1x_coco.yml @@ -0,0 +1,14 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_pretrained.pdparams +weights: output/mask_rcnn_r101_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/mask_rcnn/mask_rcnn_r50_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_1x_coco.yml new file mode 100644 index 0000000..01f4721 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/mask_rcnn_r50.yml', + '_base_/mask_reader.yml', +] +weights: output/mask_rcnn_r50_1x_coco/model_final diff --git a/configs/mask_rcnn/mask_rcnn_r50_2x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_2x_coco.yml new file mode 100644 index 0000000..f1e6b66 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_2x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'mask_rcnn_r50_1x_coco.yml', +] +weights: output/mask_rcnn_r50_2x_coco/model_final + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..95e48c2 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/mask_rcnn_r50_fpn.yml', + '_base_/mask_fpn_reader.yml', +] +weights: output/mask_rcnn_r50_fpn_1x_coco/model_final diff --git a/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.yml new file mode 100644 index 0000000..f687fd6 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] +weights: output/mask_rcnn_r50_fpn_2x_coco/model_final + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 diff --git a/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_1x_coco.yml new file mode 100644 index 0000000..d538741 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_1x_coco.yml @@ -0,0 +1,15 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/mask_rcnn_r50_vd_fpn_1x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 diff --git a/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_2x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_2x_coco.yml new file 
mode 100644 index 0000000..f85f029 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_2x_coco.yml @@ -0,0 +1,26 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_pretrained.pdparams +weights: output/mask_rcnn_r50_vd_fpn_2x_coco/model_final + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 diff --git a/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml new file mode 100644 index 0000000..c5718a8 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_1x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/mask_rcnn_r50_fpn.yml', + '_base_/mask_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/mask_rcnn_r50_vd_fpn_ssld_1x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 12 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml new file mode 100644 index 0000000..65b31e6 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_r50_vd_fpn_ssld_2x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/mask_rcnn_r50_fpn.yml', + '_base_/mask_fpn_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +weights: output/mask_rcnn_r50_vd_fpn_ssld_2x_coco/model_final + +ResNet: + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [12, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.yml b/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.yml new file mode 100644 index 0000000..2387502 --- /dev/null +++ b/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_1x_coco.yml @@ -0,0 +1,28 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/mask_rcnn_x101_vd_64x4d_fpn_1x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + variant: d + groups: 64 + base_width: 4 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 12 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_2x_coco.yml b/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_2x_coco.yml new file mode 100644 index 0000000..6a0d0f7 --- /dev/null +++ 
b/configs/mask_rcnn/mask_rcnn_x101_vd_64x4d_fpn_2x_coco.yml @@ -0,0 +1,28 @@ +_BASE_: [ + 'mask_rcnn_r50_fpn_1x_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNeXt101_vd_64x4d_pretrained.pdparams +weights: output/mask_rcnn_x101_vd_64x4d_fpn_2x_coco/model_final + +ResNet: + # for ResNeXt: groups, base_width, base_channels + depth: 101 + variant: d + groups: 64 + base_width: 4 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +epoch: 24 +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [16, 22] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 diff --git a/configs/pedestrian/README.md b/configs/pedestrian/README.md new file mode 100644 index 0000000..b5b9124 --- /dev/null +++ b/configs/pedestrian/README.md @@ -0,0 +1,50 @@ +English | [简体中文](README_cn.md) +# PaddleDetection applied for specific scenarios + +We provide some models implemented by PaddlePaddle to detect objects in specific scenarios; users can download the models and use them directly in these scenarios. + +| Task | Algorithm | Box AP | Download | Configs | +|:---------------------|:---------:|:------:| :-------------------------------------------------------------------------------------: |:------:| +| Pedestrian Detection | YOLOv3 | 51.8 | [model](https://paddledet.bj.bcebos.com/models/pedestrian_yolov3_darknet.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/pedestrian/pedestrian_yolov3_darknet.yml) | + +## Pedestrian Detection + +The main applications of pedestrian detection include intelligent monitoring. In this scenario, photos of pedestrians are taken by surveillance cameras in public areas, and pedestrian detection is then conducted on these photos. + +### 1. Network + +The network for detecting pedestrians is YOLOv3, the backbone of which is DarkNet53. + +### 2. Configuration for training + +PaddleDetection provides users with a configuration file [yolov3_darknet53_270e_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) to train YOLOv3 on the COCO dataset. Compared with this file, we modify the following parameters to train the pedestrian detection model: + +* num_classes: 1 +* dataset_dir: dataset/pedestrian + +### 3. Accuracy + +The accuracy of the model trained and evaluated on our private data is as follows: + +AP at IoU=.50:.05:.95 is 0.518. + +AP at IoU=.50 is 0.792. + +### 4. 
Inference + +Users can employ the model to run inference: + +``` +export CUDA_VISIBLE_DEVICES=0 +python -u tools/infer.py -c configs/pedestrian/pedestrian_yolov3_darknet.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/pedestrian_yolov3_darknet.pdparams \ + --infer_dir configs/pedestrian/demo \ + --draw_threshold 0.3 \ + --output_dir configs/pedestrian/demo/output +``` + +Some inference results are visualized below: + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/PedestrianDetection_001.png) + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/PedestrianDetection_004.png) diff --git a/configs/pedestrian/README_cn.md b/configs/pedestrian/README_cn.md new file mode 100644 index 0000000..3456670 --- /dev/null +++ b/configs/pedestrian/README_cn.md @@ -0,0 +1,51 @@ +[English](README.md) | 简体中文 +# 特色垂类检测模型 + +我们提供了针对不同场景的基于PaddlePaddle的检测模型,用户可以下载模型进行使用。 + +| 任务 | 算法 | 精度(Box AP) | 下载 | 配置文件 | +|:---------------------|:---------:|:------:| :---------------------------------------------------------------------------------: | :------:| +| 行人检测 | YOLOv3 | 51.8 | [下载链接](https://paddledet.bj.bcebos.com/models/pedestrian_yolov3_darknet.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/pedestrian/pedestrian_yolov3_darknet.yml) | + +## 行人检测(Pedestrian Detection) + +行人检测的主要应用有智能监控。在监控场景中,大多是从公共区域的监控摄像头视角拍摄行人,获取图像后再进行行人检测。 + +### 1. 模型结构 + +Backbone为DarkNet53的YOLOv3。 + + +### 2. 训练参数配置 + +PaddleDetection提供了使用COCO数据集对YOLOv3进行训练的参数配置文件[yolov3_darknet53_270e_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml),与之相比,在进行行人检测的模型训练时,我们对以下参数进行了修改: + +* num_classes: 1 +* dataset_dir: dataset/pedestrian + +### 3. 精度指标 + +模型在我们针对监控场景的内部数据上精度指标为: + +IOU=.5时的AP为 0.792。 + +IOU=.5-.95时的AP为 0.518。 + +### 4. 
预测 + +用户可以使用我们训练好的模型进行行人检测: + +``` +export CUDA_VISIBLE_DEVICES=0 +python -u tools/infer.py -c configs/pedestrian/pedestrian_yolov3_darknet.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/pedestrian_yolov3_darknet.pdparams \ + --infer_dir configs/pedestrian/demo \ + --draw_threshold 0.3 \ + --output_dir configs/pedestrian/demo/output +``` + +预测结果示例: + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/PedestrianDetection_001.png) + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/PedestrianDetection_004.png) diff --git a/configs/pedestrian/demo/001.png b/configs/pedestrian/demo/001.png new file mode 100644 index 0000000..63ae916 Binary files /dev/null and b/configs/pedestrian/demo/001.png differ diff --git a/configs/pedestrian/demo/002.png b/configs/pedestrian/demo/002.png new file mode 100644 index 0000000..0de905c Binary files /dev/null and b/configs/pedestrian/demo/002.png differ diff --git a/configs/pedestrian/demo/003.png b/configs/pedestrian/demo/003.png new file mode 100644 index 0000000..e9026e0 Binary files /dev/null and b/configs/pedestrian/demo/003.png differ diff --git a/configs/pedestrian/demo/004.png b/configs/pedestrian/demo/004.png new file mode 100644 index 0000000..d8118ec Binary files /dev/null and b/configs/pedestrian/demo/004.png differ diff --git a/configs/pedestrian/pedestrian.json b/configs/pedestrian/pedestrian.json new file mode 100644 index 0000000..f72fe6d --- /dev/null +++ b/configs/pedestrian/pedestrian.json @@ -0,0 +1,11 @@ +{ + "images": [], + "annotations": [], + "categories": [ + { + "supercategory": "component", + "id": 1, + "name": "pedestrian" + } + ] +} diff --git a/configs/pedestrian/pedestrian_yolov3_darknet.yml b/configs/pedestrian/pedestrian_yolov3_darknet.yml new file mode 100644 index 0000000..eb860dc --- /dev/null +++ b/configs/pedestrian/pedestrian_yolov3_darknet.yml @@ -0,0 +1,29 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../yolov3/_base_/optimizer_270e.yml', + '../yolov3/_base_/yolov3_darknet53.yml', + '../yolov3/_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: https://paddledet.bj.bcebos.com/models/pedestrian_yolov3_darknet.pdparams + +num_classes: 1 + +TrainDataset: + !COCODataSet + dataset_dir: dataset/pedestrian + anno_path: annotations/instances_train2017.json + image_dir: train2017 + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + dataset_dir: dataset/pedestrian + anno_path: annotations/instances_val2017.json + image_dir: val2017 + +TestDataset: + !ImageFolder + anno_path: configs/pedestrian/pedestrian.json diff --git a/configs/ppyolo/README.md b/configs/ppyolo/README.md new file mode 100644 index 0000000..3143229 --- /dev/null +++ b/configs/ppyolo/README.md @@ -0,0 +1,236 @@ +English | [简体中文](README_cn.md) + +# PP-YOLO + +## Table of Contents +- [Introduction](#Introduction) +- [Model Zoo](#Model_Zoo) +- [Getting Start](#Getting_Start) +- [Future Work](#Future_Work) +- [Appendix](#Appendix) + +## Introduction + +[PP-YOLO](https://arxiv.org/abs/2007.12099) is an optimized model based on YOLOv3 in PaddleDetection, whose performance (mAP on COCO) and inference speed are better than [YOLOv4](https://arxiv.org/abs/2004.10934). PaddlePaddle 2.0.0rc1 (available on pip now) or the [Daily Version](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#whl-release) is required to run this PP-YOLO. + +PP-YOLO reaches 45.9% mAP(IoU=0.5:0.95) on the COCO
test-dev2017 dataset, and inference speed of FP32 on single V100 is 72.9 FPS, inference speed of FP16 with TensorRT on single V100 is 155.6 FPS. + +
+ +
+ +PP-YOLO improved performance and speed of YOLOv3 with following methods: + +- Better backbone: ResNet50vd-DCN +- Larger training batch size: 8 GPUs and mini-batch size as 24 on each GPU +- [Drop Block](https://arxiv.org/abs/1810.12890) +- [Exponential Moving Average](https://www.investopedia.com/terms/e/ema.asp) +- [IoU Loss](https://arxiv.org/pdf/1902.09630.pdf) +- [Grid Sensitive](https://arxiv.org/abs/2004.10934) +- [Matrix NMS](https://arxiv.org/pdf/2003.10152.pdf) +- [CoordConv](https://arxiv.org/abs/1807.03247) +- [Spatial Pyramid Pooling](https://arxiv.org/abs/1406.4729) +- Better ImageNet pretrain weights + +## Model Zoo + +### PP-YOLO + +| Model | GPU number | images/GPU | backbone | input shape | Box APval | Box APtest | V100 FP32(FPS) | V100 TensorRT FP16(FPS) | download | config | +|:------------------------:|:-------:|:-------------:|:----------:| :-------:| :------------------: | :-------------------: | :------------: | :---------------------: | :------: | :------: | +| PP-YOLO | 8 | 24 | ResNet50vd | 608 | 44.8 | 45.2 | 72.9 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 512 | 43.9 | 44.4 | 89.9 | 188.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 416 | 42.1 | 42.5 | 109.1 | 215.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 320 | 38.9 | 39.3 | 132.2 | 242.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 608 | 45.3 | 45.9 | 72.9 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 512 | 44.4 | 45.0 | 89.9 | 188.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 416 | 42.7 | 43.2 | 109.1 | 215.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 320 | 39.5 | 40.1 | 132.2 | 242.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 512 | 29.2 | 29.5 | 357.1 | 657.9 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 416 | 28.6 | 28.9 | 409.8 | 719.4 | 
[model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 320 | 26.2 | 26.4 | 480.7 | 763.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLOv2 | 8 | 12 | ResNet50vd | 640 | 49.1 | 49.5 | 68.9 | 106.5 | [model](https://paddledet.bj.bcebos.com/models/ppyolov2_r50vd_dcn_365e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml) | +| PP-YOLOv2 | 8 | 12 | ResNet101vd | 640 | 49.7 | 50.3 | 49.5 | 87.0 | [model](https://paddledet.bj.bcebos.com/models/ppyolov2_r101vd_dcn_365e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml) | + + +**Notes:** + +- PP-YOLO is trained on COCO train2017 dataset and evaluated on val2017 & test-dev2017 dataset,Box APtest is evaluation results of `mAP(IoU=0.5:0.95)`. +- PP-YOLO used 8 GPUs for training and mini-batch size as 24 on each GPU, if GPU number and mini-batch size is changed, learning rate and iteration times should be adjusted according [FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md). +- PP-YOLO inference speed is tesed on single Tesla V100 with batch size as 1, CUDA 10.2, CUDNN 7.5.1, TensorRT 5.1.2.2 in TensorRT mode. +- PP-YOLO FP32 inference speed testing uses inference model exported by `tools/export_model.py` and benchmarked by running `depoly/python/infer.py` with `--run_benchmark`. All testing results do not contains the time cost of data reading and post-processing(NMS), which is same as [YOLOv4(AlexyAB)](https://github.com/AlexeyAB/darknet) in testing method. +- TensorRT FP16 inference speed testing exclude the time cost of bounding-box decoding(`yolo_box`) part comparing with FP32 testing above, which means that data reading, bounding-box decoding and post-processing(NMS) is excluded(test method same as [YOLOv4(AlexyAB)](https://github.com/AlexeyAB/darknet) too) + +### PP-YOLO for mobile + +| Model | GPU number | images/GPU | Model Size | input shape | Box APval | Box AP50val | Kirin 990 1xCore(FPS) | download | config | +|:----------------------------:|:-------:|:-------------:|:----------:| :-------:| :------------------: | :--------------------: | :--------------------: | :------: | :------: | +| PP-YOLO_MobileNetV3_large | 4 | 32 | 28MB | 320 | 23.2 | 42.6 | 14.1 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_large_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_mbv3_large_coco.yml) | +| PP-YOLO_MobileNetV3_small | 4 | 32 | 16MB | 320 | 17.2 | 33.8 | 21.5 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_small_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_mbv3_small_coco.yml) | + +**Notes:** + +- PP-YOLO_MobileNetV3 is trained on COCO train2017 datast and evaluated on val2017 dataset,Box APval is evaluation results of `mAP(IoU=0.5:0.95)`, Box AP50val is evaluation results of `mAP(IoU=0.5)`. 
+- PP-YOLO_MobileNetV3 used 4 GPUs for training and mini-batch size as 32 on each GPU, if GPU number and mini-batch size is changed, learning rate and iteration times should be adjusted according [FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md). +- PP-YOLO_MobileNetV3 inference speed is tested on Kirin 990 with 1 thread. + +### PP-YOLO tiny + +| Model | GPU number | images/GPU | Model Size | Post Quant Model Size | input shape | Box APval | Kirin 990 4xCore(FPS) | download | config | post quant model | +|:----------------------------:|:-------:|:-------------:|:----------:| :-------------------: | :---------: | :------------------: | :-------------------: | :------: | :----: | :--------------: | +| PP-YOLO tiny | 8 | 32 | 4.2MB | **1.3M** | 320 | 20.6 | 92.3 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_650e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_tiny_650e_coco.yml) | [inference model](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_quant.tar) | +| PP-YOLO tiny | 8 | 32 | 4.2MB | **1.3M** | 416 | 22.7 | 65.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_650e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_tiny_650e_coco.yml) | [inference model](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_quant.tar) | + +**Notes:** + +- PP-YOLO-tiny is trained on COCO train2017 datast and evaluated on val2017 dataset,Box APval is evaluation results of `mAP(IoU=0.5:0.95)`. +- PP-YOLO-tiny used 8 GPUs for training and mini-batch size as 32 on each GPU, if GPU number and mini-batch size is changed, learning rate and iteration times should be adjusted according [FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md). +- PP-YOLO-tiny inference speed is tested on Kirin 990 with 4 threads by arm8 +- we alse provide PP-YOLO-tiny post quant inference model, which can compress model to **1.3MB** with nearly no inference on inference speed and performance + +### PP-YOLO on Pascal VOC + +PP-YOLO trained on Pascal VOC dataset as follows: + +| Model | GPU number | images/GPU | backbone | input shape | Box AP50val | download | config | +|:------------------:|:----------:|:----------:|:----------:| :----------:| :--------------------: | :------: | :-----: | +| PP-YOLO | 8 | 12 | ResNet50vd | 608 | 84.9 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | +| PP-YOLO | 8 | 12 | ResNet50vd | 416 | 84.3 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | +| PP-YOLO | 8 | 12 | ResNet50vd | 320 | 82.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | + +## Getting Start + +### 1. 
Training
+
+Train PP-YOLO on 8 GPUs with the following command (all commands are assumed to be run from the PaddleDetection dygraph directory by default):
+
+```bash
+python -m paddle.distributed.launch --log_dir=./ppyolo_dygraph/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml &>ppyolo_dygraph.log 2>&1 &
+```
+
+Optional: run `tools/anchor_cluster.py` to get anchors suitable for your dataset, then modify the anchor settings in the model configuration file and reader configuration file, such as `configs/ppyolo/_base_/ppyolo_tiny.yml` and `configs/ppyolo/_base_/ppyolo_tiny_reader.yml`.
+
+```bash
+python tools/anchor_cluster.py -c configs/ppyolo/ppyolo_tiny_650e_coco.yml -n 9 -s 320 -m v2 -i 1000
+```
+
+### 2. Evaluation
+
+Evaluate PP-YOLO on the COCO val2017 dataset on a single GPU with the following commands:
+
+```bash
+# use weights released in PaddleDetection model zoo
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams
+
+# use a checkpoint saved during training
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=output/ppyolo_r50vd_dcn_1x_coco/model_final
+```
+
+For evaluation on the COCO test-dev2017 dataset, `configs/ppyolo/ppyolo_test.yml` should be used. Please download the COCO test-dev2017 dataset from [COCO dataset download](https://cocodataset.org/#download), decompress it to the paths configured by `EvalReader.dataset` in `configs/ppyolo/ppyolo_test.yml`, and run evaluation with the following commands:
+
+```bash
+# use weights released in PaddleDetection model zoo
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_test.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams
+
+# use a checkpoint saved during training
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_test.yml -o weights=output/ppyolo_r50vd_dcn_1x_coco/model_final
+```
+
+Evaluation results will be saved in `bbox.json`; compress it into a `zip` package and upload it to the [COCO dataset evaluation server](https://competitions.codalab.org/competitions/20794#participate) for evaluation.
+
+**NOTE 1:** `configs/ppyolo/ppyolo_test.yml` is only used for evaluation on the COCO test-dev2017 dataset; it cannot be used for training or for evaluation on COCO val2017.
+
+**NOTE 2:** Due to the overall upgrade of the dynamic graph framework, the following weights published by PaddleDetection need to be evaluated with the `--bias` flag added, for example:
+
+```bash
+# use weights released in PaddleDetection model zoo
+CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --bias
+```
+These models are:
+
+1. ppyolo_r50vd_dcn_1x_coco
+
+2. ppyolo_r50vd_dcn_voc
+
+3. ppyolo_r18vd_coco
+
+4. ppyolo_mbv3_large_coco
+
+5. ppyolo_mbv3_small_coco
+
+6. ppyolo_tiny_650e_coco
+
+### 3. Inference
+
+Run inference on images on a single GPU with the following commands; use `--infer_img` to run inference on a single image and `--infer_dir` to run inference on all images in a directory. 
+
+```bash
+# run inference on a single image
+CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439_640x640.jpg
+
+# run inference on all images in the directory
+CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_dir=demo
+```
+
+### 4. Inference deployment
+
+For inference deployment or benchmarking, the model exported with `tools/export_model.py` should be used, and inference should be performed with the Paddle inference library, using the following commands:
+
+```bash
+# export the model; it is saved in output/ppyolo by default
+python tools/export_model.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams
+
+# inference with the Paddle Inference library
+CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/ppyolo_r50vd_dcn_1x_coco --image_file=demo/000000014439_640x640.jpg --use_gpu=True
+```
+
+
+## Future work
+
+1. More PP-YOLO tiny models
+2. PP-YOLO models with more backbones
+
+## Appendix
+
+Optimization methods and ablation experiments of PP-YOLO compared with YOLOv3:
+
+| NO. | Model | Box APval | Box APtest | Params(M) | FLOPs(G) | V100 FP32 FPS |
+| :--: | :--------------------------- | :------------------: | :--------------------: | :-------: | :------: | :-----------: |
+| A | YOLOv3-DarkNet53 | 38.9 | - | 59.13 | 65.52 | 58.2 |
+| B | YOLOv3-ResNet50vd-DCN | 39.1 | - | 43.89 | 44.71 | 79.2 |
+| C | B + LB + EMA + DropBlock | 41.4 | - | 43.89 | 44.71 | 79.2 |
+| D | C + IoU Loss | 41.9 | - | 43.89 | 44.71 | 79.2 |
+| E | D + IoU Aware | 42.5 | - | 43.90 | 44.71 | 74.9 |
+| F | E + Grid Sensitive | 42.8 | - | 43.90 | 44.71 | 74.8 |
+| G | F + Matrix NMS | 43.5 | - | 43.90 | 44.71 | 74.8 |
+| H | G + CoordConv | 44.0 | - | 43.93 | 44.76 | 74.1 |
+| I | H + SPP | 44.3 | 45.2 | 44.93 | 45.12 | 72.9 |
+| J | I + Better ImageNet Pretrain | 44.8 | 45.2 | 44.93 | 45.12 | 72.9 |
+| K | J + 2x Scheduler | 45.3 | 45.9 | 44.93 | 45.12 | 72.9 |
+
+**Notes:**
+
+- Performance and inference speed are measured with input shape 608.
+- All models are trained on the COCO train2017 dataset and evaluated on the val2017 & test-dev2017 datasets; `Box AP` is the evaluation result of `mAP(IoU=0.5:0.95)`.
+- Inference speed is tested on a single Tesla V100 with batch size 1, following the test method and environment configuration of the benchmark above.
+- [YOLOv3-DarkNet53](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) with mAP 38.9 is the optimized YOLOv3 model in PaddleDetection; see [Model Zoo](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/docs/MODEL_ZOO_cn.md) for details. 
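Two of the tricks listed above map directly onto settings that appear in the configs added in this commit: Grid Sensitive corresponds to `scale_x_y: 1.05`, and the weight EMA corresponds to `use_ema: true` / `ema_decay: 0.9998`. The following is a minimal NumPy sketch of both ideas for intuition only; it is not the PaddleDetection implementation, and the function and class names are illustrative.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def decode_center(tx, cx, stride, scale_x_y=1.05):
    # Grid Sensitive decoding: vanilla YOLOv3 uses (sigmoid(tx) + cx) * stride,
    # so the box center can never land exactly on a grid-cell border because
    # sigmoid never reaches 0 or 1. Scaling by scale_x_y > 1 and re-centering
    # lets the predicted offset slightly exceed the [0, 1] range.
    offset = scale_x_y * sigmoid(tx) - 0.5 * (scale_x_y - 1.0)
    return (offset + cx) * stride

class WeightEMA:
    """Exponential moving average of model weights (use_ema / ema_decay)."""
    def __init__(self, weights, decay=0.9998):
        self.decay = decay
        self.shadow = {name: value.copy() for name, value in weights.items()}

    def update(self, weights):
        # shadow = decay * shadow + (1 - decay) * current, applied every step;
        # the shadow weights are the ones saved and used for evaluation.
        for name, value in weights.items():
            self.shadow[name] = self.decay * self.shadow[name] + (1.0 - self.decay) * value

# Example: a large raw offset in grid cell cx = 7 at stride 32 can now reach the cell border.
print(decode_center(4.0, 7, 32))
```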
+ + +## Citation + +``` +@misc{long2020ppyolo, +title={PP-YOLO: An Effective and Efficient Implementation of Object Detector}, +author={Xiang Long and Kaipeng Deng and Guanzhong Wang and Yang Zhang and Qingqing Dang and Yuan Gao and Hui Shen and Jianguo Ren and Shumin Han and Errui Ding and Shilei Wen}, +year={2020}, +eprint={2007.12099}, +archivePrefix={arXiv}, +primaryClass={cs.CV} +} +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +``` diff --git a/configs/ppyolo/README_cn.md b/configs/ppyolo/README_cn.md new file mode 100644 index 0000000..4e7c7bc --- /dev/null +++ b/configs/ppyolo/README_cn.md @@ -0,0 +1,231 @@ +简体中文 | [English](README.md) + +# PP-YOLO 模型 + +## 内容 +- [简介](#简介) +- [模型库与基线](#模型库与基线) +- [使用说明](#使用说明) +- [未来工作](#未来工作) +- [附录](#附录) + +## 简介 + +[PP-YOLO](https://arxiv.org/abs/2007.12099)是PaddleDetection优化和改进的YOLOv3的模型,其精度(COCO数据集mAP)和推理速度均优于[YOLOv4](https://arxiv.org/abs/2004.10934)模型,要求使用PaddlePaddle 2.0.0rc1(可使用pip安装) 或适当的[develop版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/Tables.html#whl-release)。 + +PP-YOLO在[COCO](http://cocodataset.org) test-dev2017数据集上精度达到45.9%,在单卡V100上FP32推理速度为72.9 FPS, V100上开启TensorRT下FP16推理速度为155.6 FPS。 + +
+ +
+ +PP-YOLO从如下方面优化和提升YOLOv3模型的精度和速度: + +- 更优的骨干网络: ResNet50vd-DCN +- 更大的训练batch size: 8 GPUs,每GPU batch_size=24,对应调整学习率和迭代轮数 +- [Drop Block](https://arxiv.org/abs/1810.12890) +- [Exponential Moving Average](https://www.investopedia.com/terms/e/ema.asp) +- [IoU Loss](https://arxiv.org/pdf/1902.09630.pdf) +- [Grid Sensitive](https://arxiv.org/abs/2004.10934) +- [Matrix NMS](https://arxiv.org/pdf/2003.10152.pdf) +- [CoordConv](https://arxiv.org/abs/1807.03247) +- [Spatial Pyramid Pooling](https://arxiv.org/abs/1406.4729) +- 更优的预训练模型 + +## 模型库 + +### PP-YOLO模型 + +| 模型 | GPU个数 | 每GPU图片个数 | 骨干网络 | 输入尺寸 | Box APval | Box APtest | V100 FP32(FPS) | V100 TensorRT FP16(FPS) | 模型下载 | 配置文件 | +|:------------------------:|:-------:|:-------------:|:----------:| :-------:| :------------------: | :-------------------: | :------------: | :---------------------: | :------: | :------: | +| PP-YOLO | 8 | 24 | ResNet50vd | 608 | 44.8 | 45.2 | 72.9 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 512 | 43.9 | 44.4 | 89.9 | 188.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 416 | 42.1 | 42.5 | 109.1 | 215.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO | 8 | 24 | ResNet50vd | 320 | 38.9 | 39.3 | 132.2 | 242.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 608 | 45.3 | 45.9 | 72.9 | 155.6 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 512 | 44.4 | 45.0 | 89.9 | 188.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 416 | 42.7 | 43.2 | 109.1 | 215.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO_2x | 8 | 24 | ResNet50vd | 320 | 39.5 | 40.1 | 132.2 | 242.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_2x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 512 | 29.2 | 29.5 | 357.1 | 657.9 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 416 | 28.6 | 28.9 | 409.8 | 719.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | 
[config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLO | 4 | 32 | ResNet18vd | 320 | 26.2 | 26.4 | 480.7 | 763.4 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r18vd_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r18vd_coco.yml) | +| PP-YOLOv2 | 8 | 12 | ResNet50vd | 640 | 49.1 | 49.5 | 68.9 | 106.5 | [model](https://paddledet.bj.bcebos.com/models/ppyolov2_r50vd_dcn_365e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml) | +| PP-YOLOv2 | 8 | 12 | ResNet101vd | 640 | 49.7 | 50.3 | 49.5 | 87.0 | [model](https://paddledet.bj.bcebos.com/models/ppyolov2_r101vd_dcn_365e_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml) | + +**注意:** + +- PP-YOLO模型使用COCO数据集中train2017作为训练集,使用val2017和test-dev2017作为测试集,Box APtest为`mAP(IoU=0.5:0.95)`评估结果。 +- PP-YOLO模型训练过程中使用8 GPUs,每GPU batch size为24进行训练,如训练GPU数和batch size不使用上述配置,须参考[FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md)调整学习率和迭代次数。 +- PP-YOLO模型推理速度测试采用单卡V100,batch size=1进行测试,使用CUDA 10.2, CUDNN 7.5.1,TensorRT推理速度测试使用TensorRT 5.1.2.2。 +- PP-YOLO模型FP32的推理速度测试数据为使用`tools/export_model.py`脚本导出模型后,使用`deploy/python/infer.py`脚本中的`--run_benchnark`参数使用Paddle预测库进行推理速度benchmark测试结果, 且测试的均为不包含数据预处理和模型输出后处理(NMS)的数据(与[YOLOv4(AlexyAB)](https://github.com/AlexeyAB/darknet)测试方法一致)。 +- TensorRT FP16的速度测试相比于FP32去除了`yolo_box`(bbox解码)部分耗时,即不包含数据预处理,bbox解码和NMS(与[YOLOv4(AlexyAB)](https://github.com/AlexeyAB/darknet)测试方法一致)。 + +### PP-YOLO 轻量级模型 + +| 模型 | GPU个数 | 每GPU图片个数 | 模型体积 | 输入尺寸 | Box APval | Box AP50val | Kirin 990 1xCore (FPS) | 模型下载 | 配置文件 | +|:----------------------------:|:-------:|:-------------:|:----------:| :-------:| :------------------: | :--------------------: | :--------------------: | :------: | :------: | +| PP-YOLO_MobileNetV3_large | 4 | 32 | 28MB | 320 | 23.2 | 42.6 | 14.1 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_large_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_mbv3_large_coco.yml) | +| PP-YOLO_MobileNetV3_small | 4 | 32 | 16MB | 320 | 17.2 | 33.8 | 21.5 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyolo_mbv3_small_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_mbv3_small_coco.yml) | + +- PP-YOLO_MobileNetV3 模型使用COCO数据集中train2017作为训练集,使用val2017作为测试集,Box APval为`mAP(IoU=0.5:0.95)`评估结果, Box AP50val为`mAP(IoU=0.5)`评估结果。 +- PP-YOLO_MobileNetV3 模型训练过程中使用4GPU,每GPU batch size为32进行训练,如训练GPU数和batch size不使用上述配置,须参考[FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md)调整学习率和迭代次数。 +- PP-YOLO_MobileNetV3 模型推理速度测试环境配置为麒麟990芯片单线程。 + +### PP-YOLO tiny + +| 模型 | GPU个数 | 每GPU图片个数 | 模型体积 | 量化后模型体积 | 输入尺寸 | Box APval | Kirin 990 4xCore(FPS) | 模型下载 | 配置文件 | 量化后模型下载 | +|:---------:|:-------:|:---------:|:---------:| :-------------------: | :---------: | :------------------: | :-------------------: | :------: | :----: | :--------------: | +| PP-YOLO tiny | 8 | 32 | 4.2MB | **1.3M** | 320 | 20.6 | 92.3 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_650e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_tiny_650e_coco.yml) | 
[推理模型](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_quant.tar) | +| PP-YOLO tiny | 8 | 32 | 4.2MB | **1.3M** | 416 | 22.7 | 65.4 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_650e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_tiny_650e_coco.yml) | [推理模型](https://paddledet.bj.bcebos.com/models/ppyolo_tiny_quant.tar) | + +**注意:** + +- PP-YOLO-tiny 在COCO train2017数据集上进行训练,在val2017数据集上进行评估,Box APval 是`mAP(IoU=0.5:0.95)`的评估结果。 +- PP-YOLO-tiny 使用8个GPU进行训练,每个GPU上的batch size为32,如果GPU数量和最小批量大小发生变化,则应根据[FAQ](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/static/docs/FAQ.md)调整学习速率和迭代次数。 +- PP-YOLO-tiny 是利用arm8在Kirin 990上4个线程来测试推理速度的。 +- 我们还提供了PP-YOLO-tiny 量化后的推理模型, 它可以将模型压缩到**1.3MB**,并且几乎不需要对推理速度和性能进行任何推理。 + +### Pascal VOC数据集上的PP-YOLO + +PP-YOLO在Pascal VOC数据集上训练模型如下: + +| 模型 | GPU个数 | 每GPU图片个数 | 骨干网络 | 输入尺寸 | Box AP50val | 模型下载 | 配置文件 | +|:------------------:|:-------:|:-------------:|:----------:| :----------:| :--------------------: | :------: | :-----: | +| PP-YOLO | 8 | 12 | ResNet50vd | 608 | 84.9 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | +| PP-YOLO | 8 | 12 | ResNet50vd | 416 | 84.3 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | +| PP-YOLO | 8 | 12 | ResNet50vd | 320 | 82.2 | [model](https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_voc.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml) | + +## 使用说明 + +### 1. 训练 + +使用8GPU通过如下命令一键式启动训练(以下命令均默认在PaddleDetection根目录运行), 通过`--eval`参数开启训练中交替评估。 + +```bash +python -m paddle.distributed.launch --log_dir=./ppyolo_dygraph/ --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml &>ppyolo_dygraph.log 2>&1 & +``` + +可选:在训练之前使用`tools/anchor_cluster.py`得到适用于你的数据集的anchor,并注意修改模型配置文件和Reader配置文件中的anchor设置,如`configs/ppyolo/_base_/ppyolo_tiny.yml`和`configs/ppyolo/_base_/ppyolo_tiny_reader.yml`中anchor设置 +```bash +python tools/anchor_cluster.py -c configs/ppyolo/ppyolo_tiny_650e_coco.yml -n 9 -s 320 -m v2 -i 1000 +``` + +### 2. 
评估 + +使用单GPU通过如下命令一键式评估模型在COCO val2017数据集效果 + +```bash +# 使用PaddleDetection发布的权重 +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams + +# 使用训练保存的checkpoint +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=output/ppyolo_r50vd_dcn_1x_coco/model_final +``` + +我们提供了`configs/ppyolo/ppyolo_test.yml`用于评估COCO test-dev2017数据集的效果,评估COCO test-dev2017数据集的效果须先从[COCO数据集下载页](https://cocodataset.org/#download)下载test-dev2017数据集,解压到`configs/ppyolo/ppyolo_test.yml`中`EvalReader.dataset`中配置的路径,并使用如下命令进行评估 + +```bash +# 使用PaddleDetection发布的权重 +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_test.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams + +# 使用训练保存的checkpoint +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_test.yml -o weights=output/ppyolo_r50vd_dcn_1x_coco/model_final +``` + +评估结果保存于`bbox.json`中,将其压缩为zip包后通过[COCO数据集评估页](https://competitions.codalab.org/competitions/20794#participate)提交评估。 + +**注意1:** `configs/ppyolo/ppyolo_test.yml`仅用于评估COCO test-dev数据集,不用于训练和评估COCO val2017数据集。 + +**注意2:** 由于动态图框架整体升级,以下几个PaddleDetection发布的权重模型评估时需要添加--bias字段, 例如 + +```bash +# 使用PaddleDetection发布的权重 +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --bias +``` +主要有: + +1.ppyolo_r50vd_dcn_1x_coco + +2.ppyolo_r50vd_dcn_voc + +3.ppyolo_r18vd_coco + +4.ppyolo_mbv3_large_coco + +5.ppyolo_mbv3_small_coco + +6.ppyolo_tiny_650e_coco + +### 3. 推理 + +使用单GPU通过如下命令一键式推理图像,通过`--infer_img`指定图像路径,或通过`--infer_dir`指定目录并推理目录下所有图像 + +```bash +# 推理单张图像 +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_img=demo/000000014439_640x640.jpg + +# 推理目录下所有图像 +CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams --infer_dir=demo +``` + +### 4. 推理部署 + +PP-YOLO模型部署及推理benchmark需要通过`tools/export_model.py`导出模型后使用Paddle预测库进行部署和推理,可通过如下命令一键式启动。 + +```bash +# 导出模型,默认存储于output/ppyolo目录 +python tools/export_model.py -c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/ppyolo_r50vd_dcn_1x_coco.pdparams + +# 预测库推理 +CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/ppyolo_r50vd_dcn_1x_coco --image_file=demo/000000014439_640x640.jpg --use_gpu=True +``` + + +## 未来工作 + +1. 发布PP-YOLO-tiny模型 +2. 
发布更多骨干网络的PP-YOLO模型 + +## 附录 + +PP-YOLO模型相对于YOLOv3模型优化项消融实验数据如下表所示。 + +| 序号 | 模型 | Box APval | Box APtest | 参数量(M) | FLOPs(G) | V100 FP32 FPS | +| :--: | :--------------------------- | :------------------: | :-------------------: | :-------: | :------: | :-----------: | +| A | YOLOv3-DarkNet53 | 38.9 | - | 59.13 | 65.52 | 58.2 | +| B | YOLOv3-ResNet50vd-DCN | 39.1 | - | 43.89 | 44.71 | 79.2 | +| C | B + LB + EMA + DropBlock | 41.4 | - | 43.89 | 44.71 | 79.2 | +| D | C + IoU Loss | 41.9 | - | 43.89 | 44.71 | 79.2 | +| E | D + IoU Aware | 42.5 | - | 43.90 | 44.71 | 74.9 | +| F | E + Grid Sensitive | 42.8 | - | 43.90 | 44.71 | 74.8 | +| G | F + Matrix NMS | 43.5 | - | 43.90 | 44.71 | 74.8 | +| H | G + CoordConv | 44.0 | - | 43.93 | 44.76 | 74.1 | +| I | H + SPP | 44.3 | 45.2 | 44.93 | 45.12 | 72.9 | +| J | I + Better ImageNet Pretrain | 44.8 | 45.2 | 44.93 | 45.12 | 72.9 | +| K | J + 2x Scheduler | 45.3 | 45.9 | 44.93 | 45.12 | 72.9 | + +**注意:** + +- 精度与推理速度数据均为使用输入图像尺寸为608的测试结果 +- Box AP为在COCO train2017数据集训练,val2017和test-dev2017数据集上评估`mAP(IoU=0.5:0.95)`数据 +- 推理速度为单卡V100上,batch size=1, 使用上述benchmark测试方法的测试结果,测试环境配置为CUDA 10.2,CUDNN 7.5.1 +- [YOLOv3-DarkNet53](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml)精度38.9为PaddleDetection优化后的YOLOv3模型,可参见[模型库](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/docs/MODEL_ZOO_cn.md) + +## 引用 + +``` +@misc{long2020ppyolo, +title={PP-YOLO: An Effective and Efficient Implementation of Object Detector}, +author={Xiang Long and Kaipeng Deng and Guanzhong Wang and Yang Zhang and Qingqing Dang and Yuan Gao and Hui Shen and Jianguo Ren and Shumin Han and Errui Ding and Shilei Wen}, +year={2020}, +eprint={2007.12099}, +archivePrefix={arXiv}, +primaryClass={cs.CV} +} +@misc{ppdet2019, +title={PaddleDetection, Object detection and instance segmentation toolkit based on PaddlePaddle.}, +author={PaddlePaddle Authors}, +howpublished = {\url{https://github.com/PaddlePaddle/PaddleDetection}}, +year={2019} +} +``` diff --git a/configs/ppyolo/_base_/optimizer_1x.yml b/configs/ppyolo/_base_/optimizer_1x.yml new file mode 100644 index 0000000..8e6301e --- /dev/null +++ b/configs/ppyolo/_base_/optimizer_1x.yml @@ -0,0 +1,21 @@ +epoch: 405 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 243 + - 324 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/_base_/optimizer_2x.yml b/configs/ppyolo/_base_/optimizer_2x.yml new file mode 100644 index 0000000..92ddbf2 --- /dev/null +++ b/configs/ppyolo/_base_/optimizer_2x.yml @@ -0,0 +1,21 @@ +epoch: 811 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 649 + - 730 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/_base_/optimizer_365e.yml b/configs/ppyolo/_base_/optimizer_365e.yml new file mode 100644 index 0000000..d834a4c --- /dev/null +++ b/configs/ppyolo/_base_/optimizer_365e.yml @@ -0,0 +1,21 @@ +epoch: 365 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 243 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + clip_grad_by_norm: 35. 
+ optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/_base_/optimizer_650e.yml b/configs/ppyolo/_base_/optimizer_650e.yml new file mode 100644 index 0000000..79a1f98 --- /dev/null +++ b/configs/ppyolo/_base_/optimizer_650e.yml @@ -0,0 +1,22 @@ +epoch: 650 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 430 + - 540 + - 610 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/_base_/ppyolo_mbv3_large.yml b/configs/ppyolo/_base_/ppyolo_mbv3_large.yml new file mode 100644 index 0000000..0faaa9a --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_mbv3_large.yml @@ -0,0 +1,56 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: MobileNetV3 + neck: PPYOLOFPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNetV3: + model_name: large + scale: 1. + with_extra_blocks: false + extra_block_filters: [] + feature_maps: [13, 16] + +PPYOLOFPN: + in_channels: [160, 368] + coord_conv: true + conv_block_num: 0 + spp: true + drop_block: true + +YOLOv3Head: + anchors: [[11, 18], [34, 47], [51, 126], + [115, 71], [120, 195], [254, 235]] + anchor_masks: [[3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.5 + downsample: [32, 16] + label_smooth: false + scale_x_y: 1.05 + iou_loss: IouLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MultiClassNMS + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + score_threshold: 0.005 diff --git a/configs/ppyolo/_base_/ppyolo_mbv3_small.yml b/configs/ppyolo/_base_/ppyolo_mbv3_small.yml new file mode 100644 index 0000000..dda9382 --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_mbv3_small.yml @@ -0,0 +1,56 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_small_x1_0_ssld_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: MobileNetV3 + neck: PPYOLOFPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNetV3: + model_name: small + scale: 1. 
+ with_extra_blocks: false + extra_block_filters: [] + feature_maps: [9, 12] + +PPYOLOFPN: + in_channels: [96, 304] + coord_conv: true + conv_block_num: 0 + spp: true + drop_block: true + +YOLOv3Head: + anchors: [[11, 18], [34, 47], [51, 126], + [115, 71], [120, 195], [254, 235]] + anchor_masks: [[3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.5 + downsample: [32, 16] + label_smooth: false + scale_x_y: 1.05 + iou_loss: IouLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MultiClassNMS + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + score_threshold: 0.005 diff --git a/configs/ppyolo/_base_/ppyolo_r18vd.yml b/configs/ppyolo/_base_/ppyolo_r18vd.yml new file mode 100644 index 0000000..56a3483 --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_r18vd.yml @@ -0,0 +1,57 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: ResNet + neck: PPYOLOFPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +ResNet: + depth: 18 + variant: d + return_idx: [2, 3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. + +PPYOLOFPN: + drop_block: true + block_size: 3 + keep_prob: 0.9 + conv_block_num: 0 + +YOLOv3Head: + anchor_masks: [[3, 4, 5], [0, 1, 2]] + anchors: [[10, 14], [23, 27], [37, 58], + [81, 82], [135, 169], [344, 319]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16] + label_smooth: false + scale_x_y: 1.05 + iou_loss: IouLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.01 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MatrixNMS + keep_top_k: 100 + score_threshold: 0.01 + post_threshold: 0.01 + nms_top_k: -1 + background_label: -1 diff --git a/configs/ppyolo/_base_/ppyolo_r50vd_dcn.yml b/configs/ppyolo/_base_/ppyolo_r50vd_dcn.yml new file mode 100644 index 0000000..22cad95 --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_r50vd_dcn.yml @@ -0,0 +1,66 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: ResNet + neck: PPYOLOFPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +ResNet: + depth: 50 + variant: d + return_idx: [1, 2, 3] + dcn_v2_stages: [3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. 
+ +PPYOLOFPN: + coord_conv: true + drop_block: true + block_size: 3 + keep_prob: 0.9 + spp: true + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + iou_aware: true + iou_aware_factor: 0.4 + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + scale_x_y: 1.05 + iou_loss: IouLoss + iou_aware_loss: IouAwareLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +IouAwareLoss: + loss_weight: 1.0 + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.01 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MatrixNMS + keep_top_k: 100 + score_threshold: 0.01 + post_threshold: 0.01 + nms_top_k: -1 + background_label: -1 diff --git a/configs/ppyolo/_base_/ppyolo_reader.yml b/configs/ppyolo/_base_/ppyolo_reader.yml new file mode 100644 index 0000000..0e9e0cc --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_reader.yml @@ -0,0 +1,43 @@ +worker_num: 2 +TrainReader: + inputs_def: + num_max_boxes: 50 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + batch_size: 24 + shuffle: true + drop_last: true + mixup_epoch: 25000 + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 608, 608] + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 diff --git a/configs/ppyolo/_base_/ppyolo_tiny.yml b/configs/ppyolo/_base_/ppyolo_tiny.yml new file mode 100644 index 0000000..d03e2bb --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_tiny.yml @@ -0,0 +1,55 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: MobileNetV3 + neck: PPYOLOTinyFPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNetV3: + model_name: large + scale: .5 + with_extra_blocks: false + extra_block_filters: [] + feature_maps: [7, 13, 16] + +PPYOLOTinyFPN: + detection_block_channels: [160, 128, 96] + spp: true + drop_block: true + +YOLOv3Head: + anchors: [[10, 15], [24, 36], [72, 42], + [35, 87], [102, 96], [60, 170], + [220, 125], [128, 222], [264, 266]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.5 + downsample: [32, 16, 8] + label_smooth: false + scale_x_y: 1.05 + iou_loss: 
IouLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MultiClassNMS + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + score_threshold: 0.005 diff --git a/configs/ppyolo/_base_/ppyolo_tiny_reader.yml b/configs/ppyolo/_base_/ppyolo_tiny_reader.yml new file mode 100644 index 0000000..4cbc090 --- /dev/null +++ b/configs/ppyolo/_base_/ppyolo_tiny_reader.yml @@ -0,0 +1,43 @@ +worker_num: 4 +TrainReader: + inputs_def: + num_max_boxes: 100 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 100} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 15], [24, 36], [72, 42], [35, 87], [102, 96], [60, 170], [220, 125], [128, 222], [264, 266]], downsample_ratios: [32, 16, 8]} + batch_size: 32 + shuffle: true + drop_last: true + mixup_epoch: 500 + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 320, 320] + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 diff --git a/configs/ppyolo/_base_/ppyolov2_r50vd_dcn.yml b/configs/ppyolo/_base_/ppyolov2_r50vd_dcn.yml new file mode 100644 index 0000000..6288ade --- /dev/null +++ b/configs/ppyolo/_base_/ppyolov2_r50vd_dcn.yml @@ -0,0 +1,65 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +YOLOv3: + backbone: ResNet + neck: PPYOLOPAN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +ResNet: + depth: 50 + variant: d + return_idx: [1, 2, 3] + dcn_v2_stages: [3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. 
+ +PPYOLOPAN: + drop_block: true + block_size: 3 + keep_prob: 0.9 + spp: true + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + iou_aware: true + iou_aware_factor: 0.5 + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + scale_x_y: 1.05 + iou_loss: IouLoss + iou_aware_loss: IouAwareLoss + +IouLoss: + loss_weight: 2.5 + loss_square: true + +IouAwareLoss: + loss_weight: 1.0 + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.01 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MatrixNMS + keep_top_k: 100 + score_threshold: 0.01 + post_threshold: 0.01 + nms_top_k: -1 + background_label: -1 diff --git a/configs/ppyolo/_base_/ppyolov2_reader.yml b/configs/ppyolo/_base_/ppyolov2_reader.yml new file mode 100644 index 0000000..7472531 --- /dev/null +++ b/configs/ppyolo/_base_/ppyolov2_reader.yml @@ -0,0 +1,43 @@ +worker_num: 8 +TrainReader: + inputs_def: + num_max_boxes: 100 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 100} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + batch_size: 12 + shuffle: true + drop_last: true + mixup_epoch: 25000 + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 640, 640] + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 diff --git a/configs/ppyolo/ppyolo_mbv3_large_coco.yml b/configs/ppyolo/ppyolo_mbv3_large_coco.yml new file mode 100644 index 0000000..d51696d --- /dev/null +++ b/configs/ppyolo/ppyolo_mbv3_large_coco.yml @@ -0,0 +1,82 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_mbv3_large.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 10 +weights: output/ppyolo_mbv3_large_coco/model_final + +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: + target_size: [224, 256, 288, 320, 352, 384, 416, 448, 480, 512] + random_size: True + random_interp: True + keep_ratio: False + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], 
is_scale: True} + - Permute: {} + - Gt2YoloTarget: + anchor_masks: [[3, 4, 5], [0, 1, 2]] + anchors: [[11, 18], [34, 47], [51, 126], [115, 71], [120, 195], [254, 235]] + downsample_ratios: [32, 16] + iou_thresh: 0.25 + num_classes: 80 + batch_size: 32 + mixup_epoch: 200 + shuffle: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 320, 320] + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 + +epoch: 270 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 162 + - 216 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/ppyolo_mbv3_small_coco.yml b/configs/ppyolo/ppyolo_mbv3_small_coco.yml new file mode 100644 index 0000000..6dba297 --- /dev/null +++ b/configs/ppyolo/ppyolo_mbv3_small_coco.yml @@ -0,0 +1,82 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_mbv3_small.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 10 +weights: output/ppyolo_mbv3_small_coco/model_final + +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: + target_size: [224, 256, 288, 320, 352, 384, 416, 448, 480, 512] + random_size: True + random_interp: True + keep_ratio: False + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: + anchor_masks: [[3, 4, 5], [0, 1, 2]] + anchors: [[11, 18], [34, 47], [51, 126], [115, 71], [120, 195], [254, 235]] + downsample_ratios: [32, 16] + iou_thresh: 0.25 + num_classes: 80 + batch_size: 32 + mixup_epoch: 200 + shuffle: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 320, 320] + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 + +epoch: 270 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 162 + - 216 + - !LinearWarmup + start_factor: 0. 
+ steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/ppyolo_r18vd_coco.yml b/configs/ppyolo/ppyolo_r18vd_coco.yml new file mode 100644 index 0000000..c15800e --- /dev/null +++ b/configs/ppyolo/ppyolo_r18vd_coco.yml @@ -0,0 +1,82 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_r18vd.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 10 +weights: output/ppyolo_r18vd_coco/model_final + +TrainReader: + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: + target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] + random_size: True + random_interp: True + keep_ratio: False + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + is_scale: True + - Permute: {} + - Gt2YoloTarget: + anchor_masks: [[3, 4, 5], [0, 1, 2]] + anchors: [[10, 14], [23, 27], [37, 58], [81, 82], [135, 169], [344, 319]] + downsample_ratios: [32, 16] + + batch_size: 32 + mixup_epoch: 500 + shuffle: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [512, 512], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 8 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 512, 512] + sample_transforms: + - Decode: {} + - Resize: {target_size: [512, 512], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 + +epoch: 270 + +LearningRate: + base_lr: 0.004 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 162 + - 216 + - !LinearWarmup + start_factor: 0. 
+ steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml b/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml new file mode 100644 index 0000000..918f340 --- /dev/null +++ b/configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_r50vd_dcn.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 16 +weights: output/ppyolo_r50vd_dcn_1x_coco/model_final diff --git a/configs/ppyolo/ppyolo_r50vd_dcn_1x_minicoco.yml b/configs/ppyolo/ppyolo_r50vd_dcn_1x_minicoco.yml new file mode 100644 index 0000000..87b976b --- /dev/null +++ b/configs/ppyolo/ppyolo_r50vd_dcn_1x_minicoco.yml @@ -0,0 +1,44 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_r50vd_dcn.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 8 +use_ema: true +weights: output/ppyolo_r50vd_dcn_1x_minicoco/model_final + +TrainReader: + batch_size: 12 + +TrainDataset: + !COCODataSet + image_dir: train2017 + # refer to https://github.com/giddyyupp/coco-minitrain + anno_path: annotations/instances_minitrain2017.json + dataset_dir: dataset/coco + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +epoch: 192 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 153 + - 173 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml b/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml new file mode 100644 index 0000000..ac6531f --- /dev/null +++ b/configs/ppyolo/ppyolo_r50vd_dcn_2x_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_r50vd_dcn.yml', + './_base_/optimizer_2x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 16 +weights: output/ppyolo_r50vd_dcn_2x_coco/model_final diff --git a/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml b/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml new file mode 100644 index 0000000..eac22ce --- /dev/null +++ b/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml @@ -0,0 +1,40 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + './_base_/ppyolo_r50vd_dcn.yml', + './_base_/optimizer_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 83 +weights: output/ppyolo_r50vd_dcn_voc/model_final + +TrainReader: + mixup_epoch: 350 + batch_size: 12 + +EvalReader: + batch_transforms: + - PadBatch: {pad_gt: True} + +epoch: 583 + +LearningRate: + base_lr: 0.00333 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 466 + - 516 + - !LinearWarmup + start_factor: 0. 
+ steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ppyolo/ppyolo_test.yml b/configs/ppyolo/ppyolo_test.yml new file mode 100644 index 0000000..928f1c9 --- /dev/null +++ b/configs/ppyolo/ppyolo_test.yml @@ -0,0 +1,15 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_r50vd_dcn.yml', + './_base_/ppyolo_1x.yml', + './_base_/ppyolo_reader.yml', +] + +snapshot_epoch: 16 + +EvalDataset: + !COCODataSet + image_dir: test2017 + anno_path: annotations/image_info_test-dev2017.json + dataset_dir: dataset/coco diff --git a/configs/ppyolo/ppyolo_tiny_650e_coco.yml b/configs/ppyolo/ppyolo_tiny_650e_coco.yml new file mode 100644 index 0000000..288a0eb --- /dev/null +++ b/configs/ppyolo/ppyolo_tiny_650e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolo_tiny.yml', + './_base_/optimizer_650e.yml', + './_base_/ppyolo_tiny_reader.yml', +] + +snapshot_epoch: 1 +weights: output/ppyolo_tiny_650e_coco/model_final diff --git a/configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml b/configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml new file mode 100644 index 0000000..0f1aee7 --- /dev/null +++ b/configs/ppyolo/ppyolov2_r101vd_dcn_365e_coco.yml @@ -0,0 +1,20 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolov2_r50vd_dcn.yml', + './_base_/optimizer_365e.yml', + './_base_/ppyolov2_reader.yml', +] + +snapshot_epoch: 8 +weights: output/ppyolov2_r101vd_dcn_365e_coco/model_final +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_ssld_pretrained.pdparams + +ResNet: + depth: 101 + variant: d + return_idx: [1, 2, 3] + dcn_v2_stages: [3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. 
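As `ppyolov2_r101vd_dcn_365e_coco.yml` above shows, a leaf config only lists its `_BASE_` files and the keys it overrides (here `pretrain_weights` and the `ResNet` block with `depth: 101`). Below is a rough Python sketch of how such layered YAML configs can be resolved; it is an illustrative approximation, not PaddleDetection's actual config loader, which among other things also constructs the custom `!` tags used in the dataset configs.

```python
import os
import yaml

def deep_merge(base, override):
    """Recursively merge `override` into `base`; values in `override` win."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def load_config(path):
    """Resolve a config by merging its _BASE_ files in order, then its own keys."""
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    resolved = {}
    for base in cfg.pop('_BASE_', []):
        base_path = os.path.join(os.path.dirname(path), base)
        resolved = deep_merge(resolved, load_config(base_path))
    return deep_merge(resolved, cfg)

# For a leaf file like the one above, load_config(...)['ResNet']['depth'] would
# resolve to 101 (assuming the base files contain no custom YAML tags such as
# !COCODataSet, which the real loader knows how to construct).
```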
diff --git a/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml b/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml new file mode 100644 index 0000000..a5e1bc3 --- /dev/null +++ b/configs/ppyolo/ppyolov2_r50vd_dcn_365e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyolov2_r50vd_dcn.yml', + './_base_/optimizer_365e.yml', + './_base_/ppyolov2_reader.yml', +] + +snapshot_epoch: 8 +weights: output/ppyolov2_r50vd_dcn_365e_coco/model_final diff --git a/configs/rcnn_enhance/README.md b/configs/rcnn_enhance/README.md new file mode 100644 index 0000000..4a53da5 --- /dev/null +++ b/configs/rcnn_enhance/README.md @@ -0,0 +1,12 @@ +## 服务器端实用目标检测方案 + +### 简介 + +* 近年来,学术界和工业界广泛关注图像中目标检测任务。基于[PaddleClas](https://github.com/PaddlePaddle/PaddleClas)中SSLD蒸馏方案训练得到的ResNet50_vd预训练模型(ImageNet1k验证集上Top1 Acc为82.39%),结合PaddleDetection中的丰富算子,飞桨提供了一种面向服务器端实用的目标检测方案PSS-DET(Practical Server Side Detection)。基于COCO2017目标检测数据集,V100单卡预测速度为为61FPS时,COCO mAP可达41.2%。 + + +### 模型库 + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | Mask AP | 下载 | 配置文件 | +| :---------------------- | :-------------: | :-------: | :-----: | :------------: | :----: | :-----: | :-------------: | :-----: | +| ResNet50-vd-FPN-Dcnv2 | Faster | 2 | 3x | 61.425 | 41.5 | - | [下载链接](https://paddledet.bj.bcebos.com/models/faster_rcnn_enhance_3x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/rcnn_enhance/faster_rcnn_enhance_3x_coco.yml) | diff --git a/configs/rcnn_enhance/_base_/faster_rcnn_enhance.yml b/configs/rcnn_enhance/_base_/faster_rcnn_enhance.yml new file mode 100644 index 0000000..d47fd2c --- /dev/null +++ b/configs/rcnn_enhance/_base_/faster_rcnn_enhance.yml @@ -0,0 +1,81 @@ +architecture: FasterRCNN +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams + +FasterRCNN: + backbone: ResNet + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + # post process + bbox_post_process: BBoxPostProcess + + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + variant: d + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + dcn_v2_stages: [1,2,3] + lr_mult_list: [0.05, 0.05, 0.1, 0.15] + +FPN: + in_channels: [256, 512, 1024, 2048] + out_channel: 64 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 500 + post_nms_top_n: 300 + + +BBoxHead: + head: TwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxLibraAssigner + bbox_loss: DIouLoss + +TwoFCHead: + out_channel: 1024 + +BBoxLibraAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +DIouLoss: + loss_weight: 10.0 + use_complete_iou_loss: true + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/configs/rcnn_enhance/_base_/faster_rcnn_enhance_reader.yml b/configs/rcnn_enhance/_base_/faster_rcnn_enhance_reader.yml new file mode 100644 index 0000000..da6ce65 --- /dev/null +++ 
b/configs/rcnn_enhance/_base_/faster_rcnn_enhance_reader.yml @@ -0,0 +1,41 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - RandomResize: {target_size: [[384,1000], [416,1000], [448,1000], [480,1000], [512,1000], [544,1000], [576,1000], [608,1000], [640,1000], [672,1000]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - AutoAugment: {autoaug_type: v1} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: true} + batch_size: 2 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [640, 640], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [640, 640], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32, pad_gt: false} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/rcnn_enhance/_base_/optimizer_3x.yml b/configs/rcnn_enhance/_base_/optimizer_3x.yml new file mode 100644 index 0000000..8bd85fa --- /dev/null +++ b/configs/rcnn_enhance/_base_/optimizer_3x.yml @@ -0,0 +1,19 @@ +epoch: 36 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [24, 33] + - !LinearWarmup + start_factor: 0. + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/rcnn_enhance/faster_rcnn_enhance_3x_coco.yml b/configs/rcnn_enhance/faster_rcnn_enhance_3x_coco.yml new file mode 100644 index 0000000..a49f245 --- /dev/null +++ b/configs/rcnn_enhance/faster_rcnn_enhance_3x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_3x.yml', + '_base_/faster_rcnn_enhance.yml', + '_base_/faster_rcnn_enhance_reader.yml', +] +weights: output/faster_rcnn_enhance_r50_3x_coco/model_final diff --git a/configs/runtime.yml b/configs/runtime.yml new file mode 100644 index 0000000..4c8d0b4 --- /dev/null +++ b/configs/runtime.yml @@ -0,0 +1,4 @@ +use_gpu: false +log_iter: 20 +save_dir: output +snapshot_epoch: 1 diff --git a/configs/slim/README.md b/configs/slim/README.md new file mode 100644 index 0000000..8a07b08 --- /dev/null +++ b/configs/slim/README.md @@ -0,0 +1,131 @@ +# 模型压缩 + +在PaddleDetection中, 提供了基于[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)进行模型压缩的完整教程和benchmark。目前支持的方法: + +- [剪裁](prune) +- [量化](quant) +- [蒸馏](distill) +- [联合策略](extensions) + +推荐您使用剪裁和蒸馏联合训练,或者使用剪裁和量化,进行检测模型压缩。 下面以YOLOv3为例,进行剪裁、蒸馏和量化实验。 + +## 实验环境 + +- Python 3.7+ +- PaddlePaddle >= 2.0.1 +- PaddleSlim >= 2.0.0 +- CUDA 9.0+ +- cuDNN >=7.5 + +**注意:** 量化训练需要依赖Paddle develop分支,可在[PaddlePaddle每日版本](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev)中下载安装合适的PaddlePaddle版本。 + +#### 安装PaddleSlim +- 方法一:直接安装: +``` +pip install paddleslim -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +- 方法二:编译安装: +``` +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd PaddleSlim +python setup.py install +``` + +## 快速开始 + +### 训练 + +```shell +python tools/train.py -c configs/{MODEL.yml} 
--slim_config configs/slim/{SLIM_CONFIG.yml} +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 + + +### 评估 + +```shell +python tools/eval.py -c configs/{MODEL.yml} --slim_config configs/slim/{SLIM_CONFIG.yml} -o weights=output/{SLIM_CONFIG}/model_final +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 +- `-o weights`: 指定压缩算法训好的模型路径。 + +### 测试 + +```shell +python tools/infer.py -c configs/{MODEL.yml} --slim_config configs/slim/{SLIM_CONFIG.yml} \ + -o weights=output/{SLIM_CONFIG}/model_final + --infer_img={IMAGE_PATH} +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 +- `-o weights`: 指定压缩算法训好的模型路径。 +- `--infer_img`: 指定测试图像路径。 + + +### 动转静导出模型 + +```shell +python tools/export_model.py -c configs/{MODEL.yml} --slim_config configs/slim/{SLIM_CONFIG.yml} -o weights=output/{SLIM_CONFIG}/model_final +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 +- `-o weights`: 指定压缩算法训好的模型路径。 + + +## Benchmark + +### 剪裁 + +#### Pascal VOC上benchmark + +| 模型 | 压缩策略 | GFLOPs | 模型体积(MB) | 输入尺寸 | 预测时延(SD855)| Box AP | 下载 | 模型配置文件 | 压缩算法配置文件 | +| :----------------| :-------: | :------------: | :-------------: | :------: | :--------: | :------: | :-----------------------------------------------------: |:-------------: | :------: | +| YOLOv3-MobileNetV1 | baseline | 24.13 | 93 | 608 | 289.9ms | 75.1 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml) | - | +| YOLOv3-MobileNetV1 | 剪裁-l1_norm(sensity) | 15.78(-34.49%) | 66(-29%) | 608 | - | 78.4(+3.3) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v1_voc_prune_l1_norm.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/prune/yolov3_prune_l1_norm.yml) | + +- 目前剪裁支持YOLO系列、SSD、TTFNet、BlazeFace,其余模型正在开发支持中。 +- SD855预测时延为使用PaddleLite部署,使用arm8架构并使用4线程(4 Threads)推理时延。 + +### 量化 + +#### COCO上benchmark + +| 模型 | 压缩策略 | 输入尺寸 | Box AP | 下载 | 模型配置文件 | 压缩算法配置文件 | +| ------------------ | ------------ | -------- | :---------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| YOLOv3-MobileNetV1 | baseline | 608 | 28.8 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | - | +| YOLOv3-MobileNetV1 | 普通在线量化 | 608 | 30.5 (+1.7) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v1_coco_qat.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/quant/yolov3_mobilenet_v1_qat.yml) | +| YOLOv3-MobileNetV3 | baseline | 608 | 31.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | - | +| YOLOv3-MobileNetV3 | PACT在线量化 | 608 | 29.1 (-2.3) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v3_coco_qat.pdparams) | 
[配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/quant/yolov3_mobilenet_v3_qat.yml) | +| YOLOv3-DarkNet53 | baseline | 608 | 39.0 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) | - | +| YOLOv3-DarkNet53 | 普通在线量化 | 608 | 38.8 (-0.2) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_darknet_coco_qat.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/quant/yolov3_darknet_qat.yml) | +| SSD-MobileNet_v1 | baseline | 300 | 73.8 | [下载链接](https://paddledet.bj.bcebos.com/models/ssd_mobilenet_v1_300_120e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml) | - | +| SSD-MobileNet_v1 | 普通在线量化 | 300 | 72.9(-0.9) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/ssd_mobilenet_v1_300_voc_qat.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/quant/ssd_mobilenet_v1_qat.yml) | +| Mask-ResNet50-FPN | baseline | (800, 1333) | 39.2/35.6 | [下载链接](https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_fpn_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml) | - | +| Mask-ResNet50-FPN | 普通在线量化 | (800, 1333) | 39.7(+0.5)/35.9(+0.3) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/mask_rcnn_r50_fpn_1x_qat.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/quant/mask_rcnn_r50_fpn_1x_qat.yml) | + + +### 蒸馏 + +#### COCO上benchmark + +| 模型 | 压缩策略 | 输入尺寸 | Box AP | 下载 | 模型配置文件 | 压缩算法配置文件 | +| ------------------ | ------------ | -------- | :---------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | +| YOLOv3-MobileNetV1 | baseline | 608 | 29.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | - | +| YOLOv3-MobileNetV1 | 蒸馏 | 608 | 31.0(+1.6) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v1_coco_distill.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/distill/yolov3_mobilenet_v1_coco_distill.yml) | + +- 具体蒸馏方法请参考[蒸馏策略文档](distill/README.md) + +### 蒸馏剪裁联合策略 + +#### COCO上benchmark + +| 模型 | 压缩策略 | 输入尺寸 | GFLOPs | 模型体积(MB) | Box AP | 下载 | 模型配置文件 | 压缩算法配置文件 | +| ------------------ | ------------ | -------- | :---------: |:---------: | :---------: |:----------------------------------------------------------: | 
:----------------------------------------------------------: | :----------------------------------------------------------: | +| YOLOv3-MobileNetV1 | baseline | 608 | 24.65 | 94.6 | 29.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | - | +| YOLOv3-MobileNetV1 | 蒸馏+剪裁 | 608 | 7.54(-69.4%) | 32.0(-66.0%) | 28.4(-1.0) | [下载链接](https://paddledet.bj.bcebos.com/models/slim/yolov3_mobilenet_v1_coco_distill_prune.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | [slim配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/slim/extensions/yolov3_mobilenet_v1_coco_distill_prune.yml) | diff --git a/configs/slim/distill/README.md b/configs/slim/distill/README.md new file mode 100644 index 0000000..da57957 --- /dev/null +++ b/configs/slim/distill/README.md @@ -0,0 +1,18 @@ +# Distillation(蒸馏) + +## YOLOv3模型蒸馏 +以YOLOv3-MobileNetV1为例,使用YOLOv3-ResNet34作为蒸馏训练的teacher网络, 对YOLOv3-MobileNetV1结构的student网络进行蒸馏。 +COCO数据集作为目标检测任务的训练目标难度更大,意味着teacher网络会预测出更多的背景bbox,如果直接用teacher的预测输出作为student学习的`soft label`会有严重的类别不均衡问题。解决这个问题需要引入新的方法,详细背景请参考论文:[Object detection at 200 Frames Per Second](https://arxiv.org/abs/1805.06361)。 +为了确定蒸馏的对象,我们首先需要找到student和teacher网络得到的`x,y,w,h,cls,objness`等Tensor,用teacher得到的结果指导student训练。具体实现可参考[代码](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/ppdet/slim/distill.py) + +## Citations +``` +@article{mehta2018object, + title={Object detection at 200 Frames Per Second}, + author={Rakesh Mehta and Cemalettin Ozturk}, + year={2018}, + eprint={1805.06361}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/slim/distill/yolov3_mobilenet_v1_coco_distill.yml b/configs/slim/distill/yolov3_mobilenet_v1_coco_distill.yml new file mode 100644 index 0000000..9998dec --- /dev/null +++ b/configs/slim/distill/yolov3_mobilenet_v1_coco_distill.yml @@ -0,0 +1,12 @@ +_BASE_: [ + '../../yolov3/yolov3_r34_270e_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams + + +slim: Distill +distill_loss: DistillYOLOv3Loss + +DistillYOLOv3Loss: + weight: 1000 diff --git a/configs/slim/extensions/yolov3_mobilenet_v1_coco_distill_prune.yml b/configs/slim/extensions/yolov3_mobilenet_v1_coco_distill_prune.yml new file mode 100644 index 0000000..f86fac5 --- /dev/null +++ b/configs/slim/extensions/yolov3_mobilenet_v1_coco_distill_prune.yml @@ -0,0 +1,24 @@ +_BASE_: [ + '../../yolov3/yolov3_r34_270e_coco.yml', +] + +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams + +slim: DistillPrune + +distill_loss: DistillYOLOv3Loss + +DistillYOLOv3Loss: + weight: 1000 + +pruner: Pruner + +Pruner: + criterion: l1_norm + pruned_params: ['conv2d_27.w_0', 'conv2d_28.w_0', 'conv2d_29.w_0', + 'conv2d_30.w_0', 'conv2d_31.w_0', 'conv2d_32.w_0', + 'conv2d_34.w_0', 'conv2d_35.w_0', 'conv2d_36.w_0', + 'conv2d_37.w_0', 'conv2d_38.w_0', 'conv2d_39.w_0', + 'conv2d_41.w_0', 'conv2d_42.w_0', 'conv2d_43.w_0', + 'conv2d_44.w_0', 'conv2d_45.w_0', 'conv2d_46.w_0'] + pruned_ratios: [0.5,0.5,0.5,0.5,0.5,0.5,0.7,0.7,0.7,0.7,0.7,0.7,0.8,0.8,0.8,0.8,0.8,0.8] diff --git a/configs/slim/prune/yolov3_prune_fpgm.yml b/configs/slim/prune/yolov3_prune_fpgm.yml new file mode 100644 index 0000000..f374538 --- /dev/null +++ b/configs/slim/prune/yolov3_prune_fpgm.yml @@ -0,0 +1,14 
@@ +# Weights of yolov3_mobilenet_v1_voc +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams +slim: Pruner + +Pruner: + criterion: fpgm + pruned_params: ['conv2d_27.w_0', 'conv2d_28.w_0', 'conv2d_29.w_0', + 'conv2d_30.w_0', 'conv2d_31.w_0', 'conv2d_32.w_0', + 'conv2d_34.w_0', 'conv2d_35.w_0', 'conv2d_36.w_0', + 'conv2d_37.w_0', 'conv2d_38.w_0', 'conv2d_39.w_0', + 'conv2d_41.w_0', 'conv2d_42.w_0', 'conv2d_43.w_0', + 'conv2d_44.w_0', 'conv2d_45.w_0', 'conv2d_46.w_0'] + pruned_ratios: [0.1,0.2,0.2,0.2,0.2,0.1,0.2,0.3,0.3,0.3,0.2,0.1,0.3,0.4,0.4,0.4,0.4,0.3] + print_params: False diff --git a/configs/slim/prune/yolov3_prune_l1_norm.yml b/configs/slim/prune/yolov3_prune_l1_norm.yml new file mode 100644 index 0000000..5b4f466 --- /dev/null +++ b/configs/slim/prune/yolov3_prune_l1_norm.yml @@ -0,0 +1,14 @@ +# Weights of yolov3_mobilenet_v1_voc +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams +slim: Pruner + +Pruner: + criterion: l1_norm + pruned_params: ['conv2d_27.w_0', 'conv2d_28.w_0', 'conv2d_29.w_0', + 'conv2d_30.w_0', 'conv2d_31.w_0', 'conv2d_32.w_0', + 'conv2d_34.w_0', 'conv2d_35.w_0', 'conv2d_36.w_0', + 'conv2d_37.w_0', 'conv2d_38.w_0', 'conv2d_39.w_0', + 'conv2d_41.w_0', 'conv2d_42.w_0', 'conv2d_43.w_0', + 'conv2d_44.w_0', 'conv2d_45.w_0', 'conv2d_46.w_0'] + pruned_ratios: [0.1,0.2,0.2,0.2,0.2,0.1,0.2,0.3,0.3,0.3,0.2,0.1,0.3,0.4,0.4,0.4,0.4,0.3] + print_params: False diff --git a/configs/slim/quant/mask_rcnn_r50_fpn_1x_qat.yml b/configs/slim/quant/mask_rcnn_r50_fpn_1x_qat.yml new file mode 100644 index 0000000..7363b4e --- /dev/null +++ b/configs/slim/quant/mask_rcnn_r50_fpn_1x_qat.yml @@ -0,0 +1,22 @@ +pretrain_weights: https://paddledet.bj.bcebos.com/models/mask_rcnn_r50_fpn_1x_coco.pdparams +slim: QAT + +QAT: + quant_config: { + 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', + 'weight_bits': 8, 'activation_bits': 8, 'dtype': 'int8', 'window_size': 10000, 'moving_rate': 0.9, + 'quantizable_layer_type': ['Conv2D', 'Linear']} + print_model: True + + +epoch: 5 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [3, 4] + - !LinearWarmup + start_factor: 0.001 + steps: 100 diff --git a/configs/slim/quant/ssd_mobilenet_v1_qat.yml b/configs/slim/quant/ssd_mobilenet_v1_qat.yml new file mode 100644 index 0000000..05e0683 --- /dev/null +++ b/configs/slim/quant/ssd_mobilenet_v1_qat.yml @@ -0,0 +1,9 @@ +pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/dygraph/ssd_mobilenet_v1_300_120e_voc.pdparams +slim: QAT + +QAT: + quant_config: { + 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', + 'weight_bits': 8, 'activation_bits': 8, 'dtype': 'int8', 'window_size': 10000, 'moving_rate': 0.9, + 'quantizable_layer_type': ['Conv2D', 'Linear']} + print_model: True diff --git a/configs/slim/quant/yolov3_darknet_qat.yml b/configs/slim/quant/yolov3_darknet_qat.yml new file mode 100644 index 0000000..281b534 --- /dev/null +++ b/configs/slim/quant/yolov3_darknet_qat.yml @@ -0,0 +1,31 @@ +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams +slim: QAT + +QAT: + quant_config: { + 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', + 'weight_bits': 8, 'activation_bits': 8, 'dtype': 'int8', 'window_size': 10000, 'moving_rate': 0.9, + 'quantizable_layer_type': ['Conv2D', 
'Linear']} + print_model: True + +epoch: 50 + +LearningRate: + base_lr: 0.0001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 30 + - 45 + - !LinearWarmup + start_factor: 0. + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/slim/quant/yolov3_mobilenet_v1_qat.yml b/configs/slim/quant/yolov3_mobilenet_v1_qat.yml new file mode 100644 index 0000000..d145208 --- /dev/null +++ b/configs/slim/quant/yolov3_mobilenet_v1_qat.yml @@ -0,0 +1,10 @@ +# Weights of yolov3_mobilenet_v1_coco +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams +slim: QAT + +QAT: + quant_config: { + 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', + 'weight_bits': 8, 'activation_bits': 8, 'dtype': 'int8', 'window_size': 10000, 'moving_rate': 0.9, + 'quantizable_layer_type': ['Conv2D', 'Linear']} + print_model: True diff --git a/configs/slim/quant/yolov3_mobilenet_v3_qat.yml b/configs/slim/quant/yolov3_mobilenet_v3_qat.yml new file mode 100644 index 0000000..8126909 --- /dev/null +++ b/configs/slim/quant/yolov3_mobilenet_v3_qat.yml @@ -0,0 +1,24 @@ +# Weights of yolov3_mobilenet_v3_coco +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_coco.pdparams +slim: QAT + +QAT: + quant_config: { + 'weight_preprocess_type': 'PACT', + 'weight_quantize_type': 'channel_wise_abs_max', 'activation_quantize_type': 'moving_average_abs_max', + 'weight_bits': 8, 'activation_bits': 8, 'dtype': 'int8', 'window_size': 10000, 'moving_rate': 0.9, + 'quantizable_layer_type': ['Conv2D', 'Linear']} + print_model: True + +epoch: 30 +LearningRate: + base_lr: 0.0001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 25 + - 28 + - !LinearWarmup + start_factor: 0. + steps: 2000 diff --git a/configs/solov2/README.md b/configs/solov2/README.md new file mode 100644 index 0000000..b3268df --- /dev/null +++ b/configs/solov2/README.md @@ -0,0 +1,38 @@ +# SOLOv2 for instance segmentation + +## Introduction + +SOLOv2 (Segmenting Objects by Locations) is a fast instance segmentation framework with strong performance. We reproduced the model of the paper, and improved and optimized the accuracy and speed of the SOLOv2. + +**Highlights:** + +- Training Time: The training time of the model of `solov2_r50_fpn_1x` on Tesla v100 with 8 GPU is only 10 hours. 
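+ +A typical training launch, as an illustrative sketch only (it assumes the standard PaddleDetection `tools/train.py` entry point and the `configs/solov2/solov2_r50_fpn_1x_coco.yml` config added later in this commit), looks like: + +```shell +# illustrative: single-card SOLOv2 1x training on COCO +CUDA_VISIBLE_DEVICES=0 python tools/train.py -c configs/solov2/solov2_r50_fpn_1x_coco.yml +```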
+ +## Model Zoo + +| Detector | Backbone | Multi-scale training | Lr schd | Mask APval | V100 FP32(FPS) | GPU | Download | Configs | +| :-------: | :---------------------: | :-------------------: | :-----: | :--------------------: | :-------------: | :-----: | :---------: | :------------------------: | +| YOLACT++ | R50-FPN | False | 80w iter | 34.1 (test-dev) | 33.5 | Xp | - | - | +| CenterMask | R50-FPN | True | 2x | 36.4 | 13.9 | Xp | - | - | +| CenterMask | V2-99-FPN | True | 3x | 40.2 | 8.9 | Xp | - | - | +| PolarMask | R50-FPN | True | 2x | 30.5 | 9.4 | V100 | - | - | +| BlendMask | R50-FPN | True | 3x | 37.8 | 13.5 | V100 | - | - | +| SOLOv2 (Paper) | R50-FPN | False | 1x | 34.8 | 18.5 | V100 | - | - | +| SOLOv2 (Paper) | X101-DCN-FPN | True | 3x | 42.4 | 5.9 | V100 | - | - | +| SOLOv2 | R50-FPN | False | 1x | 35.5 | 21.9 | V100 | [model](https://paddledet.bj.bcebos.com/models/solov2_r50_fpn_1x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/solov2/solov2_r50_fpn_1x_coco.yml) | +| SOLOv2 | R50-FPN | True | 3x | 38.0 | 21.9 | V100 | [model](https://paddledet.bj.bcebos.com/models/solov2_r50_fpn_3x_coco.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/solov2/solov2_r50_fpn_3x_coco.yml) | + +**Notes:** + +- SOLOv2 is trained on the COCO train2017 dataset and evaluated on val2017; results are reported as `mAP(IoU=0.5:0.95)`. +- SOLOv2 training depends on the Paddle develop branch; to reproduce the reported performance, please train with the [Paddle daily version](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev) or Paddle 2.0.1 (to be published in 2021.03). Training with Paddle 2.0.0 causes a slight performance drop. + +## Citations +``` +@article{wang2020solov2, + title={SOLOv2: Dynamic, Faster and Stronger}, + author={Wang, Xinlong and Zhang, Rufeng and Kong, Tao and Li, Lei and Shen, Chunhua}, + journal={arXiv preprint arXiv:2003.10152}, + year={2020} +} +``` diff --git a/configs/solov2/_base_/optimizer_1x.yml b/configs/solov2/_base_/optimizer_1x.yml new file mode 100644 index 0000000..d034482 --- /dev/null +++ b/configs/solov2/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 diff --git a/configs/solov2/_base_/solov2_r50_fpn.yml b/configs/solov2/_base_/solov2_r50_fpn.yml new file mode 100644 index 0000000..53ec3b2 --- /dev/null +++ b/configs/solov2/_base_/solov2_r50_fpn.yml @@ -0,0 +1,41 @@ +architecture: SOLOv2 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams + +SOLOv2: + backbone: ResNet + neck: FPN + solov2_head: SOLOv2Head + mask_head: SOLOv2MaskHead + +ResNet: + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [0,1,2,3] + num_stages: 4 + +FPN: + out_channel: 256 + +SOLOv2Head: + seg_feat_channels: 512 + stacked_convs: 4 + num_grids: [40, 36, 24, 16, 12] + kernel_out_channels: 256 + solov2_loss: SOLOv2Loss + mask_nms: MaskMatrixNMS + +SOLOv2MaskHead: + mid_channels: 128 + out_channels: 256 + start_level: 0 + end_level: 3 + +SOLOv2Loss: + ins_loss_weight: 3.0 + focal_loss_gamma: 2.0 + focal_loss_alpha: 0.25 + +MaskMatrixNMS: + pre_nms_top_n: 500 + post_nms_top_n: 100 diff --git a/configs/solov2/_base_/solov2_reader.yml b/configs/solov2/_base_/solov2_reader.yml new file mode 100644 index 0000000..cd980d7 --- /dev/null +++ b/configs/solov2/_base_/solov2_reader.yml @@ -0,0 +1,44 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - Poly2Mask: {} + - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True} + - RandomFlip: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + - Gt2Solov2Target: {num_grids: [40, 36, 24, 16, 12], + scale_ranges: [[1, 96], [48, 192], [96, 384], [192, 768], [384, 2048]], + coord_sigma: 0.2} + batch_size: 2 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Resize: {interp: 1, target_size: [800, 1333], keep_ratio: True} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/solov2/solov2_r50_fpn_1x_coco.yml b/configs/solov2/solov2_r50_fpn_1x_coco.yml new file mode 100644 index 0000000..e5f548d --- /dev/null +++ b/configs/solov2/solov2_r50_fpn_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/solov2_r50_fpn.yml', + '_base_/optimizer_1x.yml', + '_base_/solov2_reader.yml', +] +weights: output/solov2_r50_fpn_1x_coco/model_final diff --git a/configs/solov2/solov2_r50_fpn_3x_coco.yml b/configs/solov2/solov2_r50_fpn_3x_coco.yml new file mode 100644 index 0000000..6ffff46 --- /dev/null +++ b/configs/solov2/solov2_r50_fpn_3x_coco.yml @@ -0,0 +1,38 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + '_base_/solov2_r50_fpn.yml', + '_base_/optimizer_1x.yml', + '_base_/solov2_reader.yml', +] +weights: output/solov2_r50_fpn_3x_coco/model_final +epoch: 36 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [24, 33] + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 + +TrainReader: + sample_transforms: + - Decode: {} + - Poly2Mask: {} + - RandomResize: {interp: 1, + target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], + keep_ratio: True} + - RandomFlip: {} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + - Gt2Solov2Target: {num_grids: [40, 36, 24, 16, 12], + scale_ranges: [[1, 96], [48, 192], [96, 384], [192, 768], [384, 2048]], + coord_sigma: 0.2} + batch_size: 2 + shuffle: true + drop_last: true diff --git a/configs/ssd/README.md b/configs/ssd/README.md new file mode 100644 index 0000000..b2bcd67 --- /dev/null +++ b/configs/ssd/README.md @@ -0,0 +1,22 @@ +# SSD: Single Shot MultiBox Detector + +## Model Zoo + +### SSD on Pascal VOC + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :-------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| VGG | SSD | 8 | 240e | ---- | 77.8 | [下载链接](https://paddledet.bj.bcebos.com/models/ssd_vgg16_300_240e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ssd/ssd_vgg16_300_240e_voc.yml) | +| MobileNet v1 | SSD | 32 | 120e | ---- | 73.8 | [下载链接](https://paddledet.bj.bcebos.com/models/ssd_mobilenet_v1_300_120e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml) | + +**注意:** SSD-VGG使用4GPU在总batch size为32下训练240个epoch。SSD-MobileNetv1使用2GPU在总batch size为64下训练120周期。 + +## Citations +``` +@article{Liu_2016, + title={SSD: Single Shot MultiBox Detector}, + journal={ECCV}, + author={Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.}, + year={2016}, +} +``` diff --git a/configs/ssd/_base_/optimizer_120e.yml b/configs/ssd/_base_/optimizer_120e.yml new file mode 100644 index 0000000..0625b66 --- /dev/null +++ b/configs/ssd/_base_/optimizer_120e.yml @@ -0,0 +1,17 @@ +epoch: 120 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + milestones: [40, 60, 80, 100] + values: [0.001, 0.0005, 0.00025, 0.0001, 0.00001] + use_warmup: false + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSProp + regularizer: + factor: 0.00005 + type: L2 diff --git a/configs/ssd/_base_/optimizer_1700e.yml b/configs/ssd/_base_/optimizer_1700e.yml new file mode 100644 index 0000000..fe5fedc --- /dev/null +++ b/configs/ssd/_base_/optimizer_1700e.yml @@ -0,0 +1,18 @@ +epoch: 1700 + +LearningRate: + base_lr: 0.4 + schedulers: + - !CosineDecay + max_epochs: 1700 + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 2000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ssd/_base_/optimizer_240e.yml b/configs/ssd/_base_/optimizer_240e.yml new file mode 100644 index 0000000..de31eac --- /dev/null +++ b/configs/ssd/_base_/optimizer_240e.yml @@ -0,0 +1,21 @@ +epoch: 240 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 160 + - 200 + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ssd/_base_/ssd_mobilenet_reader.yml b/configs/ssd/_base_/ssd_mobilenet_reader.yml new file mode 
100644 index 0000000..2af8da2 --- /dev/null +++ b/configs/ssd/_base_/ssd_mobilenet_reader.yml @@ -0,0 +1,40 @@ +worker_num: 8 +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - RandomDistort: {brightness: [0.5, 1.125, 0.875], random_apply: False} + - RandomExpand: {fill_value: [127.5, 127.5, 127.5]} + - RandomCrop: {allow_no_crop: False} + - RandomFlip: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + batch_transforms: + - NormalizeImage: {mean: [127.5, 127.5, 127.5], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 32 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [127.5, 127.5, 127.5], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 1 + drop_empty: false + + +TestReader: + inputs_def: + image_shape: [3, 300, 300] + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [127.5, 127.5, 127.5], std: [127.502231, 127.502231, 127.502231], is_scale: false} + - Permute: {} + batch_size: 1 diff --git a/configs/ssd/_base_/ssd_mobilenet_v1_300.yml b/configs/ssd/_base_/ssd_mobilenet_v1_300.yml new file mode 100644 index 0000000..b8fe694 --- /dev/null +++ b/configs/ssd/_base_/ssd_mobilenet_v1_300.yml @@ -0,0 +1,41 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ssd_mobilenet_v1_coco_pretrained.pdparams + +SSD: + backbone: MobileNet + ssd_head: SSDHead + post_process: BBoxPostProcess + +MobileNet: + norm_decay: 0. + scale: 1 + conv_learning_rate: 0.1 + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + with_extra_blocks: true + feature_maps: [11, 13, 14, 15, 16, 17] + +SSDHead: + kernel_size: 1 + padding: 0 + anchor_generator: + steps: [0, 0, 0, 0, 0, 0] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + min_ratio: 20 + max_ratio: 90 + base_size: 300 + min_sizes: [60.0, 105.0, 150.0, 195.0, 240.0, 285.0] + max_sizes: [[], 150.0, 195.0, 240.0, 285.0, 300.0] + offset: 0.5 + flip: true + min_max_aspect_ratios_order: false + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/_base_/ssd_reader.yml b/configs/ssd/_base_/ssd_reader.yml new file mode 100644 index 0000000..e25bed6 --- /dev/null +++ b/configs/ssd/_base_/ssd_reader.yml @@ -0,0 +1,42 @@ +worker_num: 2 +TrainReader: + inputs_def: + num_max_boxes: 90 + + sample_transforms: + - Decode: {} + - RandomDistort: {brightness: [0.5, 1.125, 0.875], random_apply: False} + - RandomExpand: {fill_value: [104., 117., 123.]} + - RandomCrop: {allow_no_crop: true} + - RandomFlip: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + + batch_transforms: + - NormalizeImage: {mean: [104., 117., 123.], std: [1., 1., 1.], is_scale: false} + - Permute: {} + + batch_size: 8 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [104., 117., 123.], std: [1., 1., 1.], is_scale: false} + - Permute: {} + batch_size: 1 + drop_empty: false + +TestReader: + 
inputs_def: + image_shape: [3, 300, 300] + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [104., 117., 123.], std: [1., 1., 1.], is_scale: false} + - Permute: {} + batch_size: 1 diff --git a/configs/ssd/_base_/ssd_vgg16_300.yml b/configs/ssd/_base_/ssd_vgg16_300.yml new file mode 100644 index 0000000..5982105 --- /dev/null +++ b/configs/ssd/_base_/ssd_vgg16_300.yml @@ -0,0 +1,37 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/VGG16_caffe_pretrained.pdparams + +# Model Architecture +SSD: + # model feat info flow + backbone: VGG + ssd_head: SSDHead + # post process + post_process: BBoxPostProcess + +VGG: + depth: 16 + normalizations: [20., -1, -1, -1, -1, -1] + +SSDHead: + anchor_generator: + steps: [8, 16, 32, 64, 100, 300] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 20 + max_ratio: 90 + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + offset: 0.5 + flip: true + min_max_aspect_ratios_order: true + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/_base_/ssdlite300_reader.yml b/configs/ssd/_base_/ssdlite300_reader.yml new file mode 100644 index 0000000..cd13112 --- /dev/null +++ b/configs/ssd/_base_/ssdlite300_reader.yml @@ -0,0 +1,40 @@ +worker_num: 8 +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - RandomDistort: {brightness: [0.5, 1.125, 0.875], random_apply: False} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {allow_no_crop: False} + - RandomFlip: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + batch_transforms: + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 64 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 1 + drop_empty: false + + +TestReader: + inputs_def: + image_shape: [3, 300, 300] + sample_transforms: + - Decode: {} + - Resize: {target_size: [300, 300], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 1 diff --git a/configs/ssd/_base_/ssdlite320_reader.yml b/configs/ssd/_base_/ssdlite320_reader.yml new file mode 100644 index 0000000..51db614 --- /dev/null +++ b/configs/ssd/_base_/ssdlite320_reader.yml @@ -0,0 +1,40 @@ +worker_num: 8 +TrainReader: + inputs_def: + num_max_boxes: 90 + sample_transforms: + - Decode: {} + - RandomDistort: {brightness: [0.5, 1.125, 0.875], random_apply: False} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {allow_no_crop: False} + - RandomFlip: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 1} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 90} + batch_transforms: + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 64 + shuffle: true + drop_last: true + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: 
[320, 320], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 1 + drop_empty: false + + +TestReader: + inputs_def: + image_shape: [3, 320, 320] + sample_transforms: + - Decode: {} + - Resize: {target_size: [320, 320], keep_ratio: False, interp: 1} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: true} + - Permute: {} + batch_size: 1 diff --git a/configs/ssd/_base_/ssdlite_ghostnet_320.yml b/configs/ssd/_base_/ssdlite_ghostnet_320.yml new file mode 100644 index 0000000..6a9e13b --- /dev/null +++ b/configs/ssd/_base_/ssdlite_ghostnet_320.yml @@ -0,0 +1,42 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/GhostNet_x1_3_ssld_pretrained.pdparams + +SSD: + backbone: GhostNet + ssd_head: SSDHead + post_process: BBoxPostProcess + +GhostNet: + scale: 1.3 + conv_decay: 0.00004 + with_extra_blocks: true + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + feature_maps: [13, 18, 19, 20, 21, 22] + lr_mult_list: [0.25, 0.25, 0.5, 0.5, 0.75] + +SSDHead: + use_sepconv: True + conv_decay: 0.00004 + anchor_generator: + steps: [16, 32, 64, 107, 160, 320] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + min_ratio: 20 + max_ratio: 95 + base_size: 320 + min_sizes: [] + max_sizes: [] + offset: 0.5 + flip: true + clip: true + min_max_aspect_ratios_order: false + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/_base_/ssdlite_mobilenet_v1_300.yml b/configs/ssd/_base_/ssdlite_mobilenet_v1_300.yml new file mode 100644 index 0000000..db811ad --- /dev/null +++ b/configs/ssd/_base_/ssdlite_mobilenet_v1_300.yml @@ -0,0 +1,41 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV1_ssld_pretrained.pdparams + +SSD: + backbone: MobileNet + ssd_head: SSDHead + post_process: BBoxPostProcess + +MobileNet: + conv_decay: 0.00004 + scale: 1 + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + with_extra_blocks: true + feature_maps: [11, 13, 14, 15, 16, 17] + +SSDHead: + use_sepconv: True + conv_decay: 0.00004 + anchor_generator: + steps: [16, 32, 64, 100, 150, 300] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + min_ratio: 20 + max_ratio: 95 + base_size: 300 + min_sizes: [] + max_sizes: [] + offset: 0.5 + flip: true + clip: true + min_max_aspect_ratios_order: False + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/_base_/ssdlite_mobilenet_v3_large_320.yml b/configs/ssd/_base_/ssdlite_mobilenet_v3_large_320.yml new file mode 100644 index 0000000..cc6e328 --- /dev/null +++ b/configs/ssd/_base_/ssdlite_mobilenet_v3_large_320.yml @@ -0,0 +1,44 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams + +SSD: + backbone: MobileNetV3 + ssd_head: SSDHead + post_process: BBoxPostProcess + +MobileNetV3: + scale: 1.0 + model_name: large + conv_decay: 0.00004 + with_extra_blocks: true + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + feature_maps: [14, 17, 18, 19, 20, 21] + lr_mult_list: [0.25, 0.25, 0.5, 0.5, 0.75] + multiplier: 0.5 + 
+SSDHead: + use_sepconv: True + conv_decay: 0.00004 + anchor_generator: + steps: [16, 32, 64, 107, 160, 320] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + min_ratio: 20 + max_ratio: 95 + base_size: 320 + min_sizes: [] + max_sizes: [] + offset: 0.5 + flip: true + clip: true + min_max_aspect_ratios_order: false + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/_base_/ssdlite_mobilenet_v3_small_320.yml b/configs/ssd/_base_/ssdlite_mobilenet_v3_small_320.yml new file mode 100644 index 0000000..887f95f --- /dev/null +++ b/configs/ssd/_base_/ssdlite_mobilenet_v3_small_320.yml @@ -0,0 +1,44 @@ +architecture: SSD +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_small_x1_0_ssld_pretrained.pdparams + +SSD: + backbone: MobileNetV3 + ssd_head: SSDHead + post_process: BBoxPostProcess + +MobileNetV3: + scale: 1.0 + model_name: small + conv_decay: 0.00004 + with_extra_blocks: true + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + feature_maps: [10, 13, 14, 15, 16, 17] + lr_mult_list: [0.25, 0.25, 0.5, 0.5, 0.75] + multiplier: 0.5 + +SSDHead: + use_sepconv: True + conv_decay: 0.00004 + anchor_generator: + steps: [16, 32, 64, 107, 160, 320] + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + min_ratio: 20 + max_ratio: 95 + base_size: 320 + min_sizes: [] + max_sizes: [] + offset: 0.5 + flip: true + clip: true + min_max_aspect_ratios_order: false + +BBoxPostProcess: + decode: + name: SSDBox + nms: + name: MultiClassNMS + keep_top_k: 200 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 400 + nms_eta: 1.0 diff --git a/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml b/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml new file mode 100644 index 0000000..3453f02 --- /dev/null +++ b/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml @@ -0,0 +1,12 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_120e.yml', + '_base_/ssd_mobilenet_v1_300.yml', + '_base_/ssd_mobilenet_reader.yml', +] +weights: output/ssd_mobilenet_v1_300_120e_voc/model_final + +EvalReader: + batch_transforms: + - PadBatch: {pad_gt: True} diff --git a/configs/ssd/ssd_vgg16_300_240e_voc.yml b/configs/ssd/ssd_vgg16_300_240e_voc.yml new file mode 100644 index 0000000..e2e2d30 --- /dev/null +++ b/configs/ssd/ssd_vgg16_300_240e_voc.yml @@ -0,0 +1,12 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_240e.yml', + '_base_/ssd_vgg16_300.yml', + '_base_/ssd_reader.yml', +] +weights: output/ssd_vgg16_300_240e_voc/model_final + +EvalReader: + batch_transforms: + - PadBatch: {pad_gt: True} diff --git a/configs/ssd/ssdlite_ghostnet_320_coco.yml b/configs/ssd/ssdlite_ghostnet_320_coco.yml new file mode 100644 index 0000000..c6eb6c1 --- /dev/null +++ b/configs/ssd/ssdlite_ghostnet_320_coco.yml @@ -0,0 +1,27 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1700e.yml', + '_base_/ssdlite_ghostnet_320.yml', + '_base_/ssdlite320_reader.yml', +] +weights: output/ssdlite_ghostnet_320_coco/model_final + +epoch: 1700 + +LearningRate: + base_lr: 0.2 + schedulers: + - !CosineDecay + max_epochs: 1700 + - !LinearWarmup + start_factor: 0.33333 + steps: 2000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/ssd/ssdlite_mobilenet_v1_300_coco.yml 
b/configs/ssd/ssdlite_mobilenet_v1_300_coco.yml new file mode 100644 index 0000000..75cb8a8 --- /dev/null +++ b/configs/ssd/ssdlite_mobilenet_v1_300_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1700e.yml', + '_base_/ssdlite_mobilenet_v1_300.yml', + '_base_/ssdlite300_reader.yml', +] +weights: output/ssdlite_mobilenet_v1_300_coco/model_final diff --git a/configs/ssd/ssdlite_mobilenet_v3_large_320_coco.yml b/configs/ssd/ssdlite_mobilenet_v3_large_320_coco.yml new file mode 100644 index 0000000..78d561a --- /dev/null +++ b/configs/ssd/ssdlite_mobilenet_v3_large_320_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1700e.yml', + '_base_/ssdlite_mobilenet_v3_large_320.yml', + '_base_/ssdlite320_reader.yml', +] +weights: output/ssdlite_mobilenet_v3_large_320_coco/model_final diff --git a/configs/ssd/ssdlite_mobilenet_v3_small_320_coco.yml b/configs/ssd/ssdlite_mobilenet_v3_small_320_coco.yml new file mode 100644 index 0000000..fa0ce53 --- /dev/null +++ b/configs/ssd/ssdlite_mobilenet_v3_small_320_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1700e.yml', + '_base_/ssdlite_mobilenet_v3_small_320.yml', + '_base_/ssdlite320_reader.yml', +] +weights: output/ssdlite_mobilenet_v3_small_320_coco/model_final diff --git a/configs/ttfnet/README.md b/configs/ttfnet/README.md new file mode 100644 index 0000000..a20660e --- /dev/null +++ b/configs/ttfnet/README.md @@ -0,0 +1,68 @@ +# 1. TTFNet + +## 简介 + +TTFNet是一种用于实时目标检测且对训练时间友好的网络,对CenterNet收敛速度慢的问题进行改进,提出了利用高斯核生成训练样本的新方法,有效地消除了anchor-free head中存在的模糊性。同时简单轻量化的网络结构也易于进行任务扩展。 + +**特点:** + +- 结构简单,仅需要两个head检测目标位置和大小,并且去除了耗时的后处理操作 +- 训练时间短,基于DarkNet53的骨干网络,V100 8卡仅需要训练2个小时即可达到较好的模型效果 + +## Model Zoo + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :-------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| DarkNet53 | TTFNet | 12 | 1x | ---- | 33.5 | [下载链接](https://paddledet.bj.bcebos.com/models/ttfnet_darknet53_1x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ttfnet/ttfnet_darknet53_1x_coco.yml) | + + + + + +# 2. 
PAFNet + +## 简介 + +PAFNet(Paddle Anchor Free)是PaddleDetection基于TTFNet的优化模型,精度达到anchor free领域SOTA水平,同时产出移动端轻量级模型PAFNet-Lite + +PAFNet系列模型从如下方面优化TTFNet模型: + +- [CutMix](https://arxiv.org/abs/1905.04899) +- 更优的骨干网络: ResNet50vd-DCN +- 更大的训练batch size: 8 GPUs,每GPU batch_size=18 +- Synchronized Batch Normalization +- [Deformable Convolution](https://arxiv.org/abs/1703.06211) +- [Exponential Moving Average](https://www.investopedia.com/terms/e/ema.asp) +- 更优的预训练模型 + + +## 模型库 + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 配置文件 | +| :-------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| ResNet50vd | PAFNet | 18 | 10x | ---- | 39.8 | [下载链接](https://paddledet.bj.bcebos.com/models/pafnet_10x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ttfnet/pafnet_10x_coco.yml) | + + + +### PAFNet-Lite + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 | Box AP | 麒麟990延时(ms) | 体积(M) | 下载 | 配置文件 | +| :-------------- | :------------- | :-----: | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| MobileNetv3 | PAFNet-Lite | 12 | 20x | 23.9 | 26.00 | 14 | [下载链接](https://paddledet.bj.bcebos.com/models/pafnet_lite_mobilenet_v3_20x_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/ttfnet/pafnet_lite_mobilenet_v3_20x_coco.yml) | + +**注意:** 由于动态图框架整体升级,PAFNet的PaddleDetection发布的权重模型评估时需要添加--bias字段, 例如 + +```bash +# 使用PaddleDetection发布的权重 +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/ttfnet/pafnet_10x_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/pafnet_10x_coco.pdparams --bias +``` + +## Citations +``` +@article{liu2019training, + title = {Training-Time-Friendly Network for Real-Time Object Detection}, + author = {Zili Liu, Tu Zheng, Guodong Xu, Zheng Yang, Haifeng Liu, Deng Cai}, + journal = {arXiv preprint arXiv:1909.00700}, + year = {2019} +} +``` diff --git a/configs/ttfnet/_base_/optimizer_10x.yml b/configs/ttfnet/_base_/optimizer_10x.yml new file mode 100644 index 0000000..dd2c29d --- /dev/null +++ b/configs/ttfnet/_base_/optimizer_10x.yml @@ -0,0 +1,19 @@ +epoch: 120 + +LearningRate: + base_lr: 0.015 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [80, 110] + - !LinearWarmup + start_factor: 0.2 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0004 + type: L2 diff --git a/configs/ttfnet/_base_/optimizer_1x.yml b/configs/ttfnet/_base_/optimizer_1x.yml new file mode 100644 index 0000000..8457ead --- /dev/null +++ b/configs/ttfnet/_base_/optimizer_1x.yml @@ -0,0 +1,19 @@ +epoch: 12 + +LearningRate: + base_lr: 0.015 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [8, 11] + - !LinearWarmup + start_factor: 0.2 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0004 + type: L2 diff --git a/configs/ttfnet/_base_/optimizer_20x.yml b/configs/ttfnet/_base_/optimizer_20x.yml new file mode 100644 index 0000000..4dd3492 --- /dev/null +++ b/configs/ttfnet/_base_/optimizer_20x.yml @@ -0,0 +1,20 @@ +epoch: 240 + +LearningRate: + base_lr: 0.015 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [160, 220] + - !LinearWarmup + start_factor: 0.2 + steps: 1000 + +OptimizerBuilder: + clip_grad_by_norm: 35 + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0004 + type: L2 diff --git 
a/configs/ttfnet/_base_/pafnet.yml b/configs/ttfnet/_base_/pafnet.yml new file mode 100644 index 0000000..5319fe6 --- /dev/null +++ b/configs/ttfnet/_base_/pafnet.yml @@ -0,0 +1,41 @@ +architecture: TTFNet +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +norm_type: sync_bn +use_ema: true +ema_decay: 0.9998 + +TTFNet: + backbone: ResNet + neck: TTFFPN + ttf_head: TTFHead + post_process: BBoxPostProcess + +ResNet: + depth: 50 + variant: d + return_idx: [0, 1, 2, 3] + freeze_at: -1 + norm_decay: 0. + variant: d + dcn_v2_stages: [1, 2, 3] + +TTFFPN: + planes: [256, 128, 64] + shortcut_num: [3, 2, 1] + +TTFHead: + dcn_head: true + hm_loss: + name: CTFocalLoss + loss_weight: 1. + wh_loss: + name: GIoULoss + loss_weight: 5. + reduction: sum + +BBoxPostProcess: + decode: + name: TTFBox + max_per_img: 100 + score_thresh: 0.01 + down_ratio: 4 diff --git a/configs/ttfnet/_base_/pafnet_lite.yml b/configs/ttfnet/_base_/pafnet_lite.yml new file mode 100644 index 0000000..5ed2fa2 --- /dev/null +++ b/configs/ttfnet/_base_/pafnet_lite.yml @@ -0,0 +1,44 @@ +architecture: TTFNet +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams +norm_type: sync_bn + +TTFNet: + backbone: MobileNetV3 + neck: TTFFPN + ttf_head: TTFHead + post_process: BBoxPostProcess + +MobileNetV3: + scale: 1.0 + model_name: large + feature_maps: [5, 8, 14, 17] + with_extra_blocks: true + lr_mult_list: [0.25, 0.25, 0.5, 0.5, 0.75] + conv_decay: 0.00001 + norm_decay: 0.0 + extra_block_filters: [] + +TTFFPN: + planes: [96, 48, 24] + shortcut_num: [2, 2, 1] + lite_neck: true + fusion_method: concat + +TTFHead: + hm_head_planes: 48 + wh_head_planes: 24 + lite_head: true + hm_loss: + name: CTFocalLoss + loss_weight: 1. + wh_loss: + name: GIoULoss + loss_weight: 5. 
+ reduction: sum + +BBoxPostProcess: + decode: + name: TTFBox + max_per_img: 100 + score_thresh: 0.01 + down_ratio: 4 diff --git a/configs/ttfnet/_base_/pafnet_lite_reader.yml b/configs/ttfnet/_base_/pafnet_lite_reader.yml new file mode 100644 index 0000000..446a13a --- /dev/null +++ b/configs/ttfnet/_base_/pafnet_lite_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - Cutmix: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {aspect_ratio: NULL, cover_all_box: True} + - RandomFlip: {} + - GridMask: {upper_iter: 300000} + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512], random_interp: True, keep_ratio: False} + - NormalizeImage: {mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375], is_scale: false} + - Permute: {} + - Gt2TTFTarget: {down_ratio: 4} + - PadBatch: {pad_to_stride: 32} + batch_size: 12 + shuffle: true + drop_last: true + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 1, target_size: [320, 320], keep_ratio: False} + - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]} + - Permute: {} + batch_size: 1 + drop_last: false + drop_empty: false + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 1, target_size: [320, 320], keep_ratio: False} + - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]} + - Permute: {} + batch_size: 1 + drop_last: false + drop_empty: false diff --git a/configs/ttfnet/_base_/pafnet_reader.yml b/configs/ttfnet/_base_/pafnet_reader.yml new file mode 100644 index 0000000..ea90a13 --- /dev/null +++ b/configs/ttfnet/_base_/pafnet_reader.yml @@ -0,0 +1,40 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + - Cutmix: {alpha: 1.5, beta: 1.5} + - RandomDistort: {random_apply: false, random_channel: true} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {aspect_ratio: NULL, cover_all_box: True} + - RandomFlip: {prob: 0.5} + batch_transforms: + - BatchRandomResize: {target_size: [416, 448, 480, 512, 544, 576, 608, 640, 672], keep_ratio: false} + - NormalizeImage: {mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375], is_scale: false} + - Permute: {} + - Gt2TTFTarget: {down_ratio: 4} + - PadBatch: {pad_to_stride: 32} + batch_size: 18 + shuffle: true + drop_last: true + use_shared_memory: true + mixup_epoch: 100 + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 1, target_size: [512, 512], keep_ratio: False} + - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]} + - Permute: {} + batch_size: 1 + drop_last: false + drop_empty: false + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 1, target_size: [512, 512], keep_ratio: False} + - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]} + - Permute: {} + batch_size: 1 + drop_last: false + drop_empty: false diff --git a/configs/ttfnet/_base_/ttfnet_darknet53.yml b/configs/ttfnet/_base_/ttfnet_darknet53.yml new file mode 100644 index 0000000..05c7dce --- /dev/null +++ b/configs/ttfnet/_base_/ttfnet_darknet53.yml @@ -0,0 +1,35 @@ +architecture: TTFNet +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/DarkNet53_pretrained.pdparams + +TTFNet: + backbone: DarkNet + neck: TTFFPN + ttf_head: TTFHead + 
post_process: BBoxPostProcess
+
+DarkNet:
+  depth: 53
+  freeze_at: 0
+  return_idx: [1, 2, 3, 4]
+  norm_type: bn
+  norm_decay: 0.0004
+
+TTFFPN:
+  planes: [256, 128, 64]
+  shortcut_num: [3, 2, 1]
+
+TTFHead:
+  hm_loss:
+    name: CTFocalLoss
+    loss_weight: 1.
+  wh_loss:
+    name: GIoULoss
+    loss_weight: 5.
+    reduction: sum
+
+BBoxPostProcess:
+  decode:
+    name: TTFBox
+    max_per_img: 100
+    score_thresh: 0.01
+    down_ratio: 4
diff --git a/configs/ttfnet/_base_/ttfnet_reader.yml b/configs/ttfnet/_base_/ttfnet_reader.yml
new file mode 100644
index 0000000..f9ed6cc
--- /dev/null
+++ b/configs/ttfnet/_base_/ttfnet_reader.yml
@@ -0,0 +1,35 @@
+worker_num: 2
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - Resize: {interp: 1, target_size: [512, 512], keep_ratio: False}
+  - NormalizeImage: {mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375], is_scale: false}
+  - Permute: {}
+  batch_transforms:
+  - Gt2TTFTarget: {down_ratio: 4}
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 12
+  shuffle: true
+  drop_last: true
+  use_shared_memory: true
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 1, target_size: [512, 512], keep_ratio: False}
+  - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]}
+  - Permute: {}
+  batch_size: 1
+  drop_last: false
+  drop_empty: false
+
+TestReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 1, target_size: [512, 512], keep_ratio: False}
+  - NormalizeImage: {is_scale: false, mean: [123.675, 116.28, 103.53], std: [58.395, 57.12, 57.375]}
+  - Permute: {}
+  batch_size: 1
+  drop_last: false
+  drop_empty: false
diff --git a/configs/ttfnet/pafnet_10x_coco.yml b/configs/ttfnet/pafnet_10x_coco.yml
new file mode 100644
index 0000000..b14a2bc
--- /dev/null
+++ b/configs/ttfnet/pafnet_10x_coco.yml
@@ -0,0 +1,8 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_10x.yml',
+  '_base_/pafnet.yml',
+  '_base_/pafnet_reader.yml',
+]
+weights: output/pafnet_10x_coco/model_final
diff --git a/configs/ttfnet/pafnet_lite_mobilenet_v3_20x_coco.yml b/configs/ttfnet/pafnet_lite_mobilenet_v3_20x_coco.yml
new file mode 100644
index 0000000..577af16
--- /dev/null
+++ b/configs/ttfnet/pafnet_lite_mobilenet_v3_20x_coco.yml
@@ -0,0 +1,8 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_20x.yml',
+  '_base_/pafnet_lite.yml',
+  '_base_/pafnet_lite_reader.yml',
+]
+weights: output/pafnet_lite_mobilenet_v3_20x_coco/model_final
diff --git a/configs/ttfnet/ttfnet_darknet53_1x_coco.yml b/configs/ttfnet/ttfnet_darknet53_1x_coco.yml
new file mode 100644
index 0000000..5912392
--- /dev/null
+++ b/configs/ttfnet/ttfnet_darknet53_1x_coco.yml
@@ -0,0 +1,8 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_1x.yml',
+  '_base_/ttfnet_darknet53.yml',
+  '_base_/ttfnet_reader.yml',
+]
+weights: output/ttfnet_darknet53_1x_coco/model_final
diff --git a/configs/vehicle/README.md b/configs/vehicle/README.md
new file mode 100644
index 0000000..56e5e19
--- /dev/null
+++ b/configs/vehicle/README.md
@@ -0,0 +1,53 @@
+English | [简体中文](README_cn.md)
+# PaddleDetection applied for specific scenarios
+
+We provide some models implemented by PaddlePaddle to detect objects in specific scenarios. Users can download these models and use them directly in such scenarios.
+
+| Task | Algorithm | Box AP | Download | Configs |
+|:---------------------|:---------:|:------:| :-------------------------------------------------------------------------------------: |:------:|
+| Vehicle Detection | YOLOv3 | 54.5 | [model](https://paddledet.bj.bcebos.com/models/vehicle_yolov3_darknet.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/vehicle/vehicle_yolov3_darknet.yml) |
+
+## Vehicle Detection
+
+One of the major applications of vehicle detection is traffic monitoring. In this scenario, the vehicles to be detected are mostly captured by cameras mounted on top of traffic light columns.
+
+### 1. Network
+
+The network used for vehicle detection is YOLOv3, with Darknet53 as its backbone.
+
+### 2. Configuration for training
+
+PaddleDetection provides a configuration file [yolov3_darknet53_270e_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) for training YOLOv3 on the COCO dataset. Compared with this file, we modify the following parameters for vehicle detection training:
+
+* num_classes: 6
+* anchors: [[8, 9], [10, 23], [19, 15], [23, 33], [40, 25], [54, 50], [101, 80], [139, 145], [253, 224]]
+* nms/nms_top_k: 400
+* nms/score_threshold: 0.005
+* dataset_dir: dataset/vehicle
+
+### 3. Accuracy
+
+The accuracy of the model trained and evaluated on our private data is as follows:
+
+AP at IoU=.50:.05:.95 is 0.545.
+
+AP at IoU=.50 is 0.764.
+
+### 4. Inference
+
+Users can run inference with the trained model as follows:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python -u tools/infer.py -c configs/vehicle/vehicle_yolov3_darknet.yml \
+                         -o weights=https://paddledet.bj.bcebos.com/models/vehicle_yolov3_darknet.pdparams \
+                         --infer_dir configs/vehicle/demo \
+                         --draw_threshold 0.2 \
+                         --output_dir configs/vehicle/demo/output
+```
+
+Some inference results are visualized below:
+
+![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/VehicleDetection_001.jpeg)
+
+![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/VehicleDetection_005.png)
diff --git a/configs/vehicle/README_cn.md b/configs/vehicle/README_cn.md
new file mode 100644
index 0000000..5fd7f66
--- /dev/null
+++ b/configs/vehicle/README_cn.md
@@ -0,0 +1,54 @@
+[English](README.md) | 简体中文
+# 特色垂类检测模型
+
+我们提供了针对不同场景的基于PaddlePaddle的检测模型,用户可以下载模型进行使用。
+
+| 任务 | 算法 | 精度(Box AP) | 下载 | 配置文件 |
+|:---------------------|:---------:|:------:| :---------------------------------------------------------------------------------: | :------:|
+| 车辆检测 | YOLOv3 | 54.5 | [下载链接](https://paddledet.bj.bcebos.com/models/vehicle_yolov3_darknet.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/vehicle/vehicle_yolov3_darknet.yml) |
+
+
+## 车辆检测(Vehicle Detection)
+
+车辆检测的主要应用之一是交通监控。在这样的监控场景中,待检测的车辆多为道路红绿灯柱上的摄像头拍摄所得。
+
+### 1. 模型结构
+
+Backbone为Darknet53的YOLOv3。
+
+### 2. 训练参数配置
+
+PaddleDetection提供了使用COCO数据集对YOLOv3进行训练的参数配置文件[yolov3_darknet53_270e_coco.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml),与之相比,在进行车辆检测的模型训练时,我们对以下参数进行了修改:
+
+* num_classes: 6
+* anchors: [[8, 9], [10, 23], [19, 15], [23, 33], [40, 25], [54, 50], [101, 80], [139, 145], [253, 224]]
+* nms/nms_top_k: 400
+* nms/score_threshold: 0.005
+* dataset_dir: dataset/vehicle
+
+### 3. 
精度指标 + +模型在我们内部数据上的精度指标为: + +IOU=.50:.05:.95时的AP为 0.545。 + +IOU=.5时的AP为 0.764。 + +### 4. 预测 + +用户可以使用我们训练好的模型进行车辆检测: + +``` +export CUDA_VISIBLE_DEVICES=0 +python -u tools/infer.py -c configs/vehicle/vehicle_yolov3_darknet.yml \ + -o weights=https://paddledet.bj.bcebos.com/models/vehicle_yolov3_darknet.pdparams \ + --infer_dir configs/vehicle/demo \ + --draw_threshold 0.2 \ + --output_dir configs/vehicle/demo/output +``` + +预测结果示例: + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/VehicleDetection_001.jpeg) + +![](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/static/docs/images/VehicleDetection_005.png) diff --git a/configs/vehicle/demo/001.jpeg b/configs/vehicle/demo/001.jpeg new file mode 100644 index 0000000..8786db5 Binary files /dev/null and b/configs/vehicle/demo/001.jpeg differ diff --git a/configs/vehicle/demo/003.png b/configs/vehicle/demo/003.png new file mode 100644 index 0000000..c01ab4c Binary files /dev/null and b/configs/vehicle/demo/003.png differ diff --git a/configs/vehicle/demo/004.png b/configs/vehicle/demo/004.png new file mode 100644 index 0000000..8907eb8 Binary files /dev/null and b/configs/vehicle/demo/004.png differ diff --git a/configs/vehicle/demo/005.png b/configs/vehicle/demo/005.png new file mode 100644 index 0000000..bf17712 Binary files /dev/null and b/configs/vehicle/demo/005.png differ diff --git a/configs/vehicle/vehicle.json b/configs/vehicle/vehicle.json new file mode 100644 index 0000000..5863a9a --- /dev/null +++ b/configs/vehicle/vehicle.json @@ -0,0 +1,36 @@ +{ + "images": [], + "annotations": [], + "categories": [ + { + "supercategory": "component", + "id": 1, + "name": "car" + }, + { + "supercategory": "component", + "id": 2, + "name": "truck" + }, + { + "supercategory": "component", + "id": 3, + "name": "bus" + }, + { + "supercategory": "component", + "id": 4, + "name": "motorbike" + }, + { + "supercategory": "component", + "id": 5, + "name": "tricycle" + }, + { + "supercategory": "component", + "id": 6, + "name": "carplate" + } + ] +} diff --git a/configs/vehicle/vehicle_yolov3_darknet.yml b/configs/vehicle/vehicle_yolov3_darknet.yml new file mode 100644 index 0000000..17f401a --- /dev/null +++ b/configs/vehicle/vehicle_yolov3_darknet.yml @@ -0,0 +1,42 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '../yolov3/_base_/optimizer_270e.yml', + '../yolov3/_base_/yolov3_darknet53.yml', + '../yolov3/_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: https://paddledet.bj.bcebos.com/models/vehicle_yolov3_darknet.pdparams + +YOLOv3Head: + anchors: [[8, 9], [10, 23], [19, 15], + [23, 33], [40, 25], [54, 50], + [101, 80], [139, 145], [253, 224]] + +BBoxPostProcess: + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.005 + nms_threshold: 0.45 + nms_top_k: 400 + +num_classes: 6 + +TrainDataset: + !COCODataSet + dataset_dir: dataset/vehicle + anno_path: annotations/instances_train2017.json + image_dir: train2017 + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + dataset_dir: dataset/vehicle + anno_path: annotations/instances_val2017.json + image_dir: val2017 + +TestDataset: + !ImageFolder + anno_path: configs/vehicle/vehicle.json diff --git a/configs/yolov3/README.md b/configs/yolov3/README.md new file mode 100644 index 0000000..e4408c5 --- /dev/null +++ b/configs/yolov3/README.md @@ -0,0 +1,70 @@ +# YOLOv3 + +## Model Zoo + +### YOLOv3 on COCO + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | 下载 | 
配置文件 | +| :------------------- | :------- | :-----: | :-----: | :------------: | :-----: | :-----------------------------------------------------: | :-----: | +| DarkNet53(paper) | 608 | 8 | 270e | ---- | 33.0 | - | - | +| DarkNet53(paper) | 416 | 8 | 270e | ---- | 31.0 | - | - | +| DarkNet53(paper) | 320 | 8 | 270e | ---- | 28.2 | - | - | +| DarkNet53 | 608 | 8 | 270e | ---- | 39.0 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) | +| DarkNet53 | 416 | 8 | 270e | ---- | 37.5 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) | +| DarkNet53 | 320 | 8 | 270e | ---- | 34.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_darknet53_270e_coco.yml) | +| ResNet50_vd | 608 | 8 | 270e | ---- | 39.1 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r50vd_dcn_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml) | +| ResNet50_vd | 416 | 8 | 270e | ---- | 36.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r50vd_dcn_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml) | +| ResNet50_vd | 320 | 8 | 270e | ---- | 33.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r50vd_dcn_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml) | +| ResNet34 | 608 | 8 | 270e | ---- | 36.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r34_270e_coco.yml) | +| ResNet34 | 416 | 8 | 270e | ---- | 34.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r34_270e_coco.yml) | +| ResNet34 | 320 | 8 | 270e | ---- | 31.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_r34_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_r34_270e_coco.yml) | +| MobileNet-V1 | 608 | 8 | 270e | ---- | 29.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | +| MobileNet-V1 | 416 | 8 | 270e | ---- | 29.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | +| MobileNet-V1 | 320 | 8 | 270e | ---- | 27.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml) | +| MobileNet-V3 | 608 | 8 | 270e | ---- | 31.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_coco.pdparams) | 
[配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | +| MobileNet-V3 | 416 | 8 | 270e | ---- | 29.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | +| MobileNet-V3 | 320 | 8 | 270e | ---- | 27.1 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml) | +| MobileNet-V1-SSLD | 608 | 8 | 270e | ---- | 31.0 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 416 | 8 | 270e | ---- | 30.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | ---- | 28.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_coco.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml) | + +### YOLOv3 on Pasacl VOC + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | 下载 | 配置文件 | +| :----------- | :--: | :-----: | :-----: |:------------: |:----: | :-------: | :----: | +| MobileNet-V1 | 608 | 8 | 270e | - | 75.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml) | +| MobileNet-V1 | 416 | 8 | 270e | - | 76.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml) | +| MobileNet-V1 | 320 | 8 | 270e | - | 74.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml) | +| MobileNet-V3 | 608 | 8 | 270e | - | 79.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml) | +| MobileNet-V3 | 416 | 8 | 270e | - | 78.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml) | +| MobileNet-V3 | 320 | 8 | 270e | - | 76.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml) | +| MobileNet-V1-SSLD | 608 | 8 | 270e | - | 78.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| 
MobileNet-V1-SSLD | 416 | 8 | 270e | - | 79.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V1-SSLD | 320 | 8 | 270e | - | 77.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 608 | 8 | 270e | - | 80.4 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 416 | 8 | 270e | - | 79.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | +| MobileNet-V3-SSLD | 320 | 8 | 270e | - | 77.3 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v3_large_ssld_270e_voc.pdparams) | [配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.0/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml) | + +**注意:** YOLOv3均使用8GPU训练,训练270个epoch。由于动态图框架整体升级,以下几个PaddleDetection发布的权重模型评估时需要添加--bias字段, 例如 + +```bash +# 使用PaddleDetection发布的权重 +CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams --bias +``` +主要有: + +1.yolov3_darknet53_270e_coco + +2.yolov3_r50vd_dcn_270e_coco + +## Citations +``` +@misc{redmon2018yolov3, + title={YOLOv3: An Incremental Improvement}, + author={Joseph Redmon and Ali Farhadi}, + year={2018}, + eprint={1804.02767}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/yolov3/_base_/optimizer_270e.yml b/configs/yolov3/_base_/optimizer_270e.yml new file mode 100644 index 0000000..d92f3df --- /dev/null +++ b/configs/yolov3/_base_/optimizer_270e.yml @@ -0,0 +1,21 @@ +epoch: 270 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 216 + - 243 + - !LinearWarmup + start_factor: 0. 
+ steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov3/_base_/yolov3_darknet53.yml b/configs/yolov3/_base_/yolov3_darknet53.yml new file mode 100644 index 0000000..1187f6e --- /dev/null +++ b/configs/yolov3/_base_/yolov3_darknet53.yml @@ -0,0 +1,41 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/DarkNet53_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: DarkNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +DarkNet: + depth: 53 + return_idx: [2, 3, 4] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_mobilenet_v1.yml b/configs/yolov3/_base_/yolov3_mobilenet_v1.yml new file mode 100644 index 0000000..6452b51 --- /dev/null +++ b/configs/yolov3/_base_/yolov3_mobilenet_v1.yml @@ -0,0 +1,43 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV1_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: MobileNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNet: + scale: 1 + feature_maps: [4, 6, 13] + with_extra_blocks: false + extra_block_filters: [] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_mobilenet_v3_large.yml b/configs/yolov3/_base_/yolov3_mobilenet_v3_large.yml new file mode 100644 index 0000000..94b5dea --- /dev/null +++ b/configs/yolov3/_base_/yolov3_mobilenet_v3_large.yml @@ -0,0 +1,44 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: MobileNetV3 + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNetV3: + model_name: large + scale: 1. 
+ with_extra_blocks: false + extra_block_filters: [] + feature_maps: [7, 13, 16] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_mobilenet_v3_small.yml b/configs/yolov3/_base_/yolov3_mobilenet_v3_small.yml new file mode 100644 index 0000000..f0f144b --- /dev/null +++ b/configs/yolov3/_base_/yolov3_mobilenet_v3_small.yml @@ -0,0 +1,44 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_small_x1_0_ssld_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: MobileNetV3 + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +MobileNetV3: + model_name: small + scale: 1. + with_extra_blocks: false + extra_block_filters: [] + feature_maps: [4, 9, 12] + +# use default config +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_r34.yml b/configs/yolov3/_base_/yolov3_r34.yml new file mode 100644 index 0000000..c2d1489 --- /dev/null +++ b/configs/yolov3/_base_/yolov3_r34.yml @@ -0,0 +1,41 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet34_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: ResNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +ResNet: + depth: 34 + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_r50vd_dcn.yml b/configs/yolov3/_base_/yolov3_r50vd_dcn.yml new file mode 100644 index 0000000..0d01148 --- /dev/null +++ b/configs/yolov3/_base_/yolov3_r50vd_dcn.yml @@ -0,0 +1,45 @@ +architecture: YOLOv3 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_pretrained.pdparams +norm_type: sync_bn + +YOLOv3: + backbone: ResNet + neck: YOLOv3FPN + yolo_head: YOLOv3Head + post_process: BBoxPostProcess + +ResNet: + depth: 50 + variant: d + return_idx: [1, 2, 3] + dcn_v2_stages: [3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. 
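+  # return_idx [1, 2, 3] exposes the C3-C5 feature maps to the FPN, dcn_v2_stages [3]
+  # enables deformable convolution v2 in the last ResNet stage, and freeze_at -1 with
+  # freeze_norm false keeps all stages and their norm layers trainable.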
+ +# YOLOv3FPN: + +YOLOv3Head: + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + loss: YOLOv3Loss + +YOLOv3Loss: + ignore_thresh: 0.7 + downsample: [32, 16, 8] + label_smooth: false + +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.01 + nms_threshold: 0.45 + nms_top_k: 1000 diff --git a/configs/yolov3/_base_/yolov3_reader.yml b/configs/yolov3/_base_/yolov3_reader.yml new file mode 100644 index 0000000..f0130c1 --- /dev/null +++ b/configs/yolov3/_base_/yolov3_reader.yml @@ -0,0 +1,45 @@ +worker_num: 2 +TrainReader: + inputs_def: + num_max_boxes: 50 + sample_transforms: + - Decode: {} + - Mixup: {alpha: 1.5, beta: 1.5} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeBox: {} + - PadBox: {num_max_boxes: 50} + - BboxXYXY2XYWH: {} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + - Gt2YoloTarget: {anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]], anchors: [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], downsample_ratios: [32, 16, 8]} + batch_size: 8 + shuffle: true + drop_last: true + mixup_epoch: 250 + use_shared_memory: true + +EvalReader: + inputs_def: + num_max_boxes: 50 + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 + drop_empty: false + +TestReader: + inputs_def: + image_shape: [3, 608, 608] + sample_transforms: + - Decode: {} + - Resize: {target_size: [608, 608], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True} + - Permute: {} + batch_size: 1 diff --git a/configs/yolov3/yolov3_darknet53_270e_coco.yml b/configs/yolov3/yolov3_darknet53_270e_coco.yml new file mode 100644 index 0000000..4fbd401 --- /dev/null +++ b/configs/yolov3/yolov3_darknet53_270e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_darknet53.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_darknet53_270e_coco/model_final diff --git a/configs/yolov3/yolov3_darknet53_270e_voc.yml b/configs/yolov3/yolov3_darknet53_270e_voc.yml new file mode 100644 index 0000000..e24c01e --- /dev/null +++ b/configs/yolov3/yolov3_darknet53_270e_voc.yml @@ -0,0 +1,14 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_darknet53.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_darknet53_270e_voc/model_final + +EvalReader: + batch_transforms: + - PadBatch: {pad_gt: True} diff --git a/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml b/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml new file mode 100644 index 0000000..b9dd33b --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + 
'_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_mobilenet_v1_270e_coco/model_final diff --git a/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml new file mode 100644 index 0000000..7b25cd0 --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml @@ -0,0 +1,22 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_mobilenet_v1_270e_voc/model_final + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 216 + - 243 + - !LinearWarmup + start_factor: 0. + steps: 1000 diff --git a/configs/yolov3/yolov3_mobilenet_v1_roadsign.yml b/configs/yolov3/yolov3_mobilenet_v1_roadsign.yml new file mode 100644 index 0000000..d899375 --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v1_roadsign.yml @@ -0,0 +1,33 @@ +_BASE_: [ + '../datasets/roadsign_voc.yml', + '../runtime.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] +pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov3_mobilenet_v1_270e_coco.pdparams +weights: output/yolov3_mobilenet_v1_roadsign/model_final + +YOLOv3Loss: + ignore_thresh: 0.7 + label_smooth: true + +snapshot_epoch: 2 +epoch: 40 + +LearningRate: + base_lr: 0.0001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [32, 36] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 100 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 diff --git a/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml new file mode 100644 index 0000000..10cf816 --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_coco.yml @@ -0,0 +1,11 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV1_ssld_pretrained.pdparams +weights: output/yolov3_mobilenet_v1_ssld_270e_coco/model_final diff --git a/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml new file mode 100644 index 0000000..7a3e62f --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml @@ -0,0 +1,23 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v1.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV1_ssld_pretrained.pdparams +weights: output/yolov3_mobilenet_v1_ssld_270e_voc/model_final + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 216 + - 243 + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 diff --git a/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml b/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml new file mode 100644 index 0000000..d1b8af5 --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v3_large.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_mobilenet_v3_large_270e_coco/model_final diff --git a/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml new file mode 100644 index 0000000..abf492e --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml @@ -0,0 +1,22 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v3_large.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_mobilenet_v3_large_270e_voc/model_final + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 216 + - 243 + - !LinearWarmup + start_factor: 0. + steps: 1000 diff --git a/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml new file mode 100644 index 0000000..6d183e3 --- /dev/null +++ b/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml @@ -0,0 +1,23 @@ +_BASE_: [ + '../datasets/voc.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_mobilenet_v3_large.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams +weights: output/yolov3_mobilenet_v3_large_ssld_270e_voc/model_final + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 216 + - 243 + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 diff --git a/configs/yolov3/yolov3_r34_270e_coco.yml b/configs/yolov3/yolov3_r34_270e_coco.yml new file mode 100644 index 0000000..8653b06 --- /dev/null +++ b/configs/yolov3/yolov3_r34_270e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_r34.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_r34_270e_coco/model_final diff --git a/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml b/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml new file mode 100644 index 0000000..a07cbdd --- /dev/null +++ b/configs/yolov3/yolov3_r50vd_dcn_270e_coco.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_r50vd_dcn.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_r50vd_dcn_270e_coco/model_final diff --git a/contrib/VehicleDetection/vehicle_yolov3_darknet.yml b/contrib/VehicleDetection/vehicle_yolov3_darknet.yml new file mode 100644 index 0000000..825f1c9 --- /dev/null +++ b/contrib/VehicleDetection/vehicle_yolov3_darknet.yml @@ -0,0 +1,10 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_270e.yml', + '_base_/yolov3_darknet53.yml', + '_base_/yolov3_reader.yml', +] + +snapshot_epoch: 5 +weights: output/yolov3_darknet53_270e_coco/model_final \ No newline at end of file diff --git a/dataset/coco/download_coco.py b/dataset/coco/download_coco.py new file mode 100644 index 0000000..47659fa --- /dev/null +++ b/dataset/coco/download_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'coco') diff --git a/dataset/roadsign_voc/download_roadsign_voc.py b/dataset/roadsign_voc/download_roadsign_voc.py new file mode 100644 index 0000000..3cb517d --- /dev/null +++ b/dataset/roadsign_voc/download_roadsign_voc.py @@ -0,0 +1,28 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
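+
+# Typical usage (for example, from the PaddleDetection root directory):
+#     python dataset/roadsign_voc/download_roadsign_voc.py
+# The roadsign_voc dataset is downloaded and extracted into this script's directory.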
+ +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'roadsign_voc') diff --git a/dataset/roadsign_voc/label_list.txt b/dataset/roadsign_voc/label_list.txt new file mode 100644 index 0000000..1be460f --- /dev/null +++ b/dataset/roadsign_voc/label_list.txt @@ -0,0 +1,4 @@ +speedlimit +crosswalk +trafficlight +stop \ No newline at end of file diff --git a/dataset/voc/create_list.py b/dataset/voc/create_list.py new file mode 100644 index 0000000..5ab8042 --- /dev/null +++ b/dataset/voc/create_list.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import create_voc_list + +logging.basicConfig(level=logging.INFO) + +voc_path = osp.split(osp.realpath(sys.argv[0]))[0] +create_voc_list(voc_path) diff --git a/dataset/voc/download_voc.py b/dataset/voc/download_voc.py new file mode 100644 index 0000000..e4c449c --- /dev/null +++ b/dataset/voc/download_voc.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os.path as osp +import logging +# add python path of PadleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'voc') diff --git a/dataset/voc/label_list.txt b/dataset/voc/label_list.txt new file mode 100644 index 0000000..8420ab3 --- /dev/null +++ b/dataset/voc/label_list.txt @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/dataset/wider_face/download_wider_face.sh b/dataset/wider_face/download_wider_face.sh new file mode 100644 index 0000000..59a2054 --- /dev/null +++ b/dataset/wider_face/download_wider_face.sh @@ -0,0 +1,21 @@ +# All rights `PaddleDetection` reserved +# References: +# @inproceedings{yang2016wider, +# Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, +# Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, +# Title = {WIDER FACE: A Face Detection Benchmark}, +# Year = {2016}} + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip +wget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip +wget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip +# Extract the data. +echo "Extracting..." +unzip -q WIDER_train.zip +unzip -q WIDER_val.zip +unzip -q wider_face_split.zip diff --git a/demo/00000001.jpg b/demo/00000001.jpg new file mode 100644 index 0000000..b8d24aa Binary files /dev/null and b/demo/00000001.jpg differ diff --git a/demo/00000002.jpg b/demo/00000002.jpg new file mode 100644 index 0000000..3533a1d Binary files /dev/null and b/demo/00000002.jpg differ diff --git a/demo/00000003.jpg b/demo/00000003.jpg new file mode 100644 index 0000000..07ccdb3 Binary files /dev/null and b/demo/00000003.jpg differ diff --git a/demo/00000004.jpg b/demo/00000004.jpg new file mode 100644 index 0000000..dc75ddf Binary files /dev/null and b/demo/00000004.jpg differ diff --git a/demo/00000005.jpg b/demo/00000005.jpg new file mode 100644 index 0000000..0304899 Binary files /dev/null and b/demo/00000005.jpg differ diff --git a/demo/00000006.jpg b/demo/00000006.jpg new file mode 100644 index 0000000..3e9885c Binary files /dev/null and b/demo/00000006.jpg differ diff --git a/demo/00000007.jpg b/demo/00000007.jpg new file mode 100644 index 0000000..7607eee Binary files /dev/null and b/demo/00000007.jpg differ diff --git a/demo/output/00000001.jpg b/demo/output/00000001.jpg new file mode 100644 index 0000000..a769387 Binary files /dev/null and b/demo/output/00000001.jpg differ diff --git a/demo/output/00000002.jpg b/demo/output/00000002.jpg new file mode 100644 index 0000000..454f227 Binary files /dev/null and b/demo/output/00000002.jpg differ diff --git a/demo/output/00000003.jpg b/demo/output/00000003.jpg new file mode 100644 index 0000000..35e38c3 Binary files /dev/null and b/demo/output/00000003.jpg differ diff --git a/demo/output/00000004.jpg b/demo/output/00000004.jpg new file mode 100644 index 0000000..5cdd44d Binary files /dev/null and b/demo/output/00000004.jpg differ diff --git a/demo/output/00000005.jpg b/demo/output/00000005.jpg new file mode 100644 index 0000000..75693b4 
Binary files /dev/null and b/demo/output/00000005.jpg differ diff --git a/demo/output/00000006.jpg b/demo/output/00000006.jpg new file mode 100644 index 0000000..9dff9cd Binary files /dev/null and b/demo/output/00000006.jpg differ diff --git a/demo/output/00000007.jpg b/demo/output/00000007.jpg new file mode 100644 index 0000000..57d249f Binary files /dev/null and b/demo/output/00000007.jpg differ diff --git a/deploy/BENCHMARK_INFER.md b/deploy/BENCHMARK_INFER.md new file mode 100644 index 0000000..988cf30 --- /dev/null +++ b/deploy/BENCHMARK_INFER.md @@ -0,0 +1,60 @@ +# 推理Benchmark + +## 一、环境准备 +- 1、测试环境: + - CUDA 10.1 + - CUDNN 7.6 + - TensorRT-6.0.1 + - PaddlePaddle v2.0.1 + - GPU分别为: Tesla V100和GTX 1080Ti和Jetson AGX Xavier +- 2、测试方式: + - 为了方便比较不同模型的推理速度,输入采用同样大小的图片,为 3x640x640,采用 `demo/000000014439_640x640.jpg` 图片。 + - Batch Size=1 + - 去掉前100轮warmup时间,测试100轮的平均时间,单位ms/image,包括网络计算时间、数据拷贝至CPU的时间。 + - 采用Fluid C++预测引擎: 包含Fluid C++预测、Fluid-TensorRT预测,下面同时测试了Float32 (FP32) 和Float16 (FP16)的推理速度。 + +**注意:** TensorRT中固定尺寸和动态尺寸区别请参考文档[TENSOR教程](TENSOR_RT.md)。由于固定尺寸下对两阶段模型支持不完善,所以faster rcnn模型采用动态尺寸测试。固定尺寸和动态尺寸支持融合的OP不完全一样,因此同一个模型在固定尺寸和动态尺寸下测试的性能可能会有一点差异。 + +## 二、推理速度 + +### 1、Linux系统 +#### (1)Tesla V100 + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 27.99 | 26.15 | 21.92 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 32.49 | 25.54 | 21.70 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 9.74 | 8.61 | 6.28 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 17.84 | 15.43 | 9.86 | +| PPYOLO | ResNet50 | 是 | 608x608 | 20.77 | 18.40 | 13.53 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 5.17 | 4.43 | 4.29 | +| TTFNet | Darknet53 | 是 | 512x512 | 10.14 | 8.71 | 5.55 | +| FCOS | ResNet50 | 是 | 640x640 | 35.47 | 35.02 | 34.24 | + + +#### (2)Jetson AGX Xavier + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 169.45 | 158.92 | 119.25 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 228.07 | 156.39 | 117.03 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 48.76 | 43.83 | 18.41 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 121.61 | 110.30 | 42.38 | +| PPYOLO | ResNet50 | 是 | 608x608 | 111.80 | 99.40 | 48.05 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 10.52 | 8.84 | 8.77 | +| TTFNet | Darknet53 | 是 | 512x512 | 73.77 | 64.03 | 31.46 | +| FCOS | ResNet50 | 是 | 640x640 | 217.11 | 214.38 | 205.78 | + +### 2、Windows系统 +#### (1)GTX 1080Ti + +| 模型 | backbone | 是否固定尺寸 | 入网尺寸 | paddle_inference | trt_fp32 | trt_fp16 | +|-------------------------------|--------------|--------|----------|------------------|----------|----------| +| Faster RCNN FPN | ResNet50 | 否 | 640x640 | 50.74 | 57.17 | 62.08 | +| Faster RCNN FPN | ResNet50 | 否 | 800x1312 | 50.31 | 57.61 | 62.05 | +| YOLOv3 | Mobilenet\_v1 | 是 | 608x608 | 14.51 | 11.23 | 11.13 | +| YOLOv3 | Darknet53 | 是 | 608x608 | 30.26 | 23.92 | 24.02 | +| PPYOLO | ResNet50 | 是 | 608x608 | 38.06 | 31.40 | 31.94 | +| SSD | Mobilenet\_v1 | 是 | 300x300 | 16.47 | 13.87 | 13.76 | +| TTFNet | Darknet53 | 是 | 512x512 | 21.83 | 17.14 | 17.09 | +| FCOS | ResNet50 | 是 | 640x640 | 71.88 | 69.93 | 69.52 | diff --git a/deploy/EXPORT_MODEL.md b/deploy/EXPORT_MODEL.md new file mode 100644 index 0000000..50f50cb --- /dev/null +++ b/deploy/EXPORT_MODEL.md @@ -0,0 
+1,55 @@ +# 模型导出教程 + +## 一、模型导出 +本章节介绍如何使用`tools/export_model.py`脚本导出模型。 + +### 1、导出模输入输出说明 +- 输入变量以及输入形状如下: + + | 输入名称 | 输入形状 | 表示含义 | + | :---------: | ----------- | ---------- | + | image | [None, 3, H, W] | 输入网络的图像,None表示batch维度,如果输入图像大小为变长,则H,W为None | + | im_shape | [None, 2] | 图像经过resize后的大小,表示为H,W, None表示batch维度 | + | scale_factor | [None, 2] | 输入图像大小比真实图像大小,表示为scale_y, scale_x | + + **注意** : 具体预处理方式可参考配置文件中TestReader部分。 + + +- 动转静导出模型输出统一为: + + - bbox, NMS的输出,形状为[N, 6], 其中N为预测框的个数,6为[class_id, score, x1, y1, x2, y2]。 + - bbox\_num, 每张图片对应预测框的个数,例如batch_size为2,输出为[N1, N2], 表示第一张图包含N1个预测框,第二张图包含N2个预测框,并且预测框的总个数和NMS输出的第一维N相同 + - mask,如果网络中包含mask,则会输出mask分支 + + **注意**模型动转静导出不支持模型结构中包含numpy相关操作的情况。 + + +### 2、启动参数说明 + +| FLAG | 用途 | 默认值 | 备注 | +|:--------------:|:--------------:|:------------:|:-----------------------------------------:| +| -c | 指定配置文件 | None | | +| --output_dir | 模型保存路径 | `./output_inference` | 模型默认保存在`output/配置文件名/`路径下 | + +### 3、使用示例 + +使用训练得到的模型进行试用,脚本如下 + +```bash +# 导出YOLOv3模型 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams +``` + +预测模型会导出到`inference_model/yolov3_darknet53_270e_coco`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`。 + + +### 4、设置导出模型的输入大小 + +使用Fluid-TensorRT进行预测时,由于<=TensorRT 5.1的版本仅支持定长输入,保存模型的`data`层的图片大小需要和实际输入图片大小一致。而Fluid C++预测引擎没有此限制。设置TestReader中的`image_shape`可以修改保存模型中的输入图片大小。示例如下: + +```bash +# 导出YOLOv3模型,输入是3x640x640 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --output_dir=./inference_model \ + -o weights=weights/yolov3_darknet53_270e_coco.pdparams TestReader.inputs_def.image_shape=[3,640,640] +``` diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000..b026ded --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,80 @@ +# PaddleDetection 预测部署 +训练得到一个满足要求的模型后,如果想要将该模型部署到已选择的平台上,需要通过`tools/export_model.py`将模型导出预测部署的模型和配置文件。 +并在同一文件夹下导出预测时使用的配置文件,配置文件名为`infer_cfg.yml`。 + +## 1、`PaddleDetection`目前支持的部署方式按照部署设备可以分为: +- 在本机`python`语言部署,支持在有`python paddle`(支持`CPU`、`GPU`)环境下部署,有两种方式: + - 使用`tools/infer.py`,此种方式依赖`PaddleDetection`代码库。 + - 将模型导出,使用`deploy/python/infer.py`,此种方式不依赖`PaddleDetection`代码库,可以单个`python`文件部署。 +- 在本机`C++`语言使用`paddle inference`预测库部署,支持在`Linux`和`Windows`系统下部署。请参考文档[C++部署](cpp/README.md)。 +- 在服务器端以服务形式部署,使用[PaddleServing](./serving/README.md)部署。 +- 在手机移动端部署,使用[Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) 在手机移动端部署。 + 常见模型部署Demo请参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) 。 +- `NV Jetson`嵌入式设备上部署 +- `TensorRT`加速请参考文档[TensorRT预测部署教程](TENSOR_RT.md) + +## 2、模型导出 +使用`tools/export_model.py`脚本导出模型已经部署时使用的配置文件,配置文件名字为`infer_cfg.yml`。模型导出脚本如下: +```bash +# 导出YOLOv3模型 +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=weights/yolov3_darknet53_270e_coco.pdparams +``` +预测模型会导出到`output_inference/yolov3_darknet53_270e_coco`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`。 + +如果需要导出`PaddleServing`格式的模型,需要设置`export_serving_model=True`: +```buildoutcfg +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=weights/yolov3_darknet53_270e_coco.pdparams --export_serving_model=True +``` +预测模型会导出到`output_inference/yolov3_darknet53_270e_coco`目录下,分别为`infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`, `serving_client/`文件夹, `serving_server/`文件夹。 + 
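+As a quick sanity check of an exported model, the snippet below is a minimal sketch (not part of this repository) of how the exported `model.pdmodel`/`model.pdiparams` files could be loaded with the Paddle Inference Python API. The export directory, the dummy 608x608 input and the output ordering (`bbox` before `bbox_num`) are assumptions for illustration only; real deployments should follow `deploy/python/infer.py` and the preprocessing defined in `infer_cfg.yml`.
+
+```python
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+# Assumed export directory produced by tools/export_model.py
+model_dir = "output_inference/yolov3_darknet53_270e_coco"
+config = Config(model_dir + "/model.pdmodel", model_dir + "/model.pdiparams")
+config.enable_use_gpu(200, 0)  # 200 MB initial GPU memory pool on GPU 0; omit for CPU-only
+predictor = create_predictor(config)
+
+# Dummy 608x608 input just to illustrate the input layout (image, im_shape, scale_factor)
+feeds = {
+    "image": np.zeros((1, 3, 608, 608), dtype=np.float32),
+    "im_shape": np.array([[608.0, 608.0]], dtype=np.float32),
+    "scale_factor": np.array([[1.0, 1.0]], dtype=np.float32),
+}
+for name in predictor.get_input_names():
+    predictor.get_input_handle(name).copy_from_cpu(feeds[name])
+
+predictor.run()
+
+# Assumed output order: bbox with shape [N, 6] = [class_id, score, x1, y1, x2, y2], then bbox_num
+out_names = predictor.get_output_names()
+bbox = predictor.get_output_handle(out_names[0]).copy_to_cpu()
+bbox_num = predictor.get_output_handle(out_names[1]).copy_to_cpu()
+print(bbox.shape, bbox_num)
+```
+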
+模型导出具体请参考文档[PaddleDetection模型导出教程](EXPORT_MODEL.md)。 + +## 3、如何选择部署时依赖库的版本 + +### (1)CUDA、cuDNN、TensorRT版本选择 +由于CUDA、cuDNN、TENSORRT不一定都是向前兼容的,需要使用与编译Paddle预测库使用的环境完全一致的环境进行部署。 + +### (2)部署时预测库版本、预测引擎版本选择 + +- Linux、Windows平台下C++部署,需要使用Paddle预测库进行部署。 + (1)Paddle官网提供在不同平台、不同环境下编译好的预测库,您可以直接使用,请在这里[Paddle预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 选择。 + (2)如果您将要部署的平台环境,Paddle官网上没有提供已编译好的预测库,您可以自行编译,编译过程请参考[Paddle源码编译](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html)。 + +**注意:** Paddle预测库版本需要>=2.0 + +- Python语言部署,需要在对应平台上安装Paddle Python包。如果Paddle官网上没有提供该平台下的Paddle Python包,您可以自行编译,编译过程请参考[Paddle源码编译](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html)。 + +- PaddleServing部署 + PaddleServing 0.4.0是基于Paddle 1.8.4开发,PaddleServing 0.5.0是基于Paddle2.0开发。 + +- Paddle-Lite部署 + Paddle-Lite支持OP列表请参考:[Paddle-Lite支持的OP列表](https://paddle-lite.readthedocs.io/zh/latest/source_compile/library.html) ,请跟进所部署模型中使用到的op选择Paddle-Lite版本。 + +- NV Jetson部署 + Paddle官网提供在NV Jetson平台上已经编译好的预测库,[Paddle NV Jetson预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 。 + 若列表中没有您需要的预测库,您可以在您的平台上自行编译,编译过程请参考[Paddle源码编译](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html)。 + + +## 4、部署 +- C++部署,先使用跨平台编译工具`CMake`根据`CMakeLists.txt`生成`Makefile`,支持`Windows、Linux、NV Jetson`平台,然后进行编译产出可执行文件。可以直接使用`cpp/scripts/build.sh`脚本编译: +```buildoutcfg +cd cpp +sh scripts/build.sh +``` + +- Python部署,可以使用使用`tools/infer.py`(以来PaddleDetection源码)部署,或者使用`deploy/python/infer.py`单文件部署 + +- PaddleServing部署请参考,[PaddleServing部署](./serving/README.md)部署。 + +- 手机移动端部署,请参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)部署。 + + +## 5、常见问题QA +- 1、`Paddle 1.8.4`训练的模型,可以用`Paddle2.0`部署吗? + Paddle 2.0是兼容Paddle 1.8.4的,因此是可以的。但是部分模型(如SOLOv2)使用到了Paddle 2.0中新增OP,这类模型不可以。 + +- 2、Windows编译时,预测库是VS2015编译的,选择VS2017或VS2019会有问题吗? + 关于VS兼容性问题请参考:[C++Visual Studio 2015、2017和2019之间的二进制兼容性](https://docs.microsoft.com/zh-cn/cpp/porting/binary-compat-2015-2017?view=msvc-160) + +- 3、cuDNN 8.0.4连续预测会发生内存泄漏吗? + 经QA测试,发现cuDNN 8系列连续预测时都有内存泄漏问题,且cuDNN 8性能差于cuDNN 7,推荐使用CUDA + cuDNN7.6.4的方式进行部署。 diff --git a/deploy/TENSOR_RT.md b/deploy/TENSOR_RT.md new file mode 100644 index 0000000..9d97cf2 --- /dev/null +++ b/deploy/TENSOR_RT.md @@ -0,0 +1,93 @@ +# TensorRT预测部署教程 +TensorRT是NVIDIA提出的用于统一模型部署的加速库,可以应用于V100、JETSON Xavier等硬件,它可以极大提高预测速度。Paddle TensorRT教程请参考文档[使用Paddle-TensorRT库预测](https://paddle-inference.readthedocs.io/en/latest/optimize/paddle_trt.html#) + +## 1. 安装PaddleInference预测库 +- Python安装包,请从[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release) 下载带有tensorrt的安装包进行安装 + +- CPP预测库,请从[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 下载带有TensorRT编译的预测库 + +- 如果Python和CPP官网没有提供已编译好的安装包或预测库,请参考[源码安装](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html) 自行编译 + +注意,您的机器上TensorRT的版本需要跟您使用的预测库中TensorRT版本保持一致。 + +## 2. 导出模型 +模型导出具体请参考文档[PaddleDetection模型导出教程](../EXPORT_MODEL.md)。 + +## 3. 
开启TensorRT加速 +### 3.1 配置TensorRT +在使用Paddle预测库构建预测器配置config时,打开TensorRT引擎就可以了: + +``` +config->EnableUseGpu(100, 0); // 初始化100M显存,使用GPU ID为0 +config->GpuDeviceId(); // 返回正在使用的GPU ID +// 开启TensorRT预测,可提升GPU预测性能,需要使用带TensorRT的预测库 +config->EnableTensorRtEngine(1 << 20 /*workspace_size*/, + batch_size /*max_batch_size*/, + 3 /*min_subgraph_size*/, + AnalysisConfig::Precision::kFloat32 /*precision*/, + false /*use_static*/, + false /*use_calib_mode*/); + +``` + +### 3.2 TensorRT固定尺寸预测 +TensorRT版本<=5时,使用TensorRT预测时,只支持固定尺寸输入。 + +在导出模型时指定模型输入尺寸,设置`TestReader.inputs_def.image_shape=[3,640,640]`,具体请参考[PaddleDetection模型导出教程](../EXPORT_MODEL.md) 。 + +`TestReader.inputs_def.image_shape`设置的是输入TensorRT引擎的数据尺寸(在像FasterRCNN中,`TestReader.inputs_def.image_shape`指定的是在`Pad`操作之前的图像数据尺寸)。 + +可以通过[visualdl](https://www.paddlepaddle.org.cn/paddle/visualdl/demo/graph) 打开`model.pdmodel`文件,查看输入的第一个Tensor尺寸是否是固定的,如果不指定,尺寸会用`?`表示,如下图所示: +![img](imgs/input_shape.png) + +同时需要将图像预处理后的尺寸与设置车模型输入尺寸保持一致,需要设置`infer_cfg.yml`配置文件中`Resize OP`的`target_size`参数和`keep_ratio`参数。 + +注意:由于TesnorRT不支持在batch维度进行slice操作,Faster RCNN 和 Mask RCNN使用固定尺寸输入预测会报错,这两个模型请使用动态尺寸输入。 + +以`YOLOv3`为例,使用动态尺寸输入预测: +``` +python python/infer.py --model_dir=../inference_model/yolov3_darknet53_270e_coco/ --image_file=../demo/000000014439_640x640.jpg --use_gpu=True --run_mode=trt_fp32 --run_benchmark=True +``` + +### 3.3 TensorRT动态尺寸预测 + +TensorRT版本>=6时,使用TensorRT预测时,可以支持动态尺寸输入。 +Paddle预测库关于动态尺寸输入请查看[Paddle CPP预测](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/native_infer.html) 的`SetTRTDynamicShapeInfo`函数说明。 + +`python/infer.py`设置动态尺寸输入参数说明: + +- use_dynamic_shape 用于设定TensorRT的输入尺寸是否是动态尺寸,默认值:False + +- trt_min_shape 用于设定TensorRT的输入图像height、width中的最小尺寸,默认值:1 + +- trt_max_shape 用于设定TensorRT的输入图像height、width中的最大尺寸,默认值:1280 + +- trt_opt_shape 用于设定TensorRT的输入图像height、width中的最优尺寸,默认值:640 + +**注意:`TensorRT`中动态尺寸设置是4维的,这里只设置输入图像的尺寸。** + +以`Faster RCNN`为例,使用动态尺寸输入预测: +``` +python python/infer.py --model_dir=../inference_model/faster_rcnn_r50_fpn_1x_coco/ --image_file=../demo/000000014439.jpg --use_gpu=True --run_mode=trt_fp16 --run_benchmark=True --use_dynamic_shape=True --trt_max_shape=1280 --trt_min_shape=800 --trt_opt_shape=960 +``` + +## 4、常见问题QA +**Q:** 提示没有`tensorrt_op`
+**A:** 请检查是否使用带有TensorRT的Paddle Python包或预测库。 + +**Q:** 提示`op out of memory`
+**A:** 请先用`nvidia-smi`确认该GPU是否被其他进程占用,并尝试改用空闲GPU(可通过`--gpu_id`参数指定);若显存确实不足,也可以适当减小TensorRT的workspace大小,参考下面的示例。
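以下为一段C++配置示意(非完整实现,显存池与workspace大小均为示例值,需根据实际显存调整),对应`deploy/cpp`中构建`Config`的位置:

```
// 示意:切换到空闲GPU并减小TensorRT workspace(数值仅为示例)
config.EnableUseGpu(200, 1);                  // 初始化200MB显存池,使用GPU 1
config.EnableTensorRtEngine(1 << 25,          // workspace由示例中的1 << 30减小到32MB
                            1,                // max_batch_size
                            3,                // min_subgraph_size
                            paddle_infer::Config::Precision::kFloat32,
                            false,            // use_static
                            false);           // use_calib_mode
```

**Q:** 提示`some trt inputs dynamic shape info not set`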
+**A:** 这是由于`TensorRT`会把网络结构划分成多个子图,我们只为输入数据(image)设置了动态尺寸,其余子图的输入并未设置动态尺寸。有两个解决方法: + +- 方法一:增大`min_subgraph_size`,跳过对这些子图的优化。根据报错提示,将`min_subgraph_size`设置为大于未设置动态尺寸输入的子图中的OP个数即可。`min_subgraph_size`的含义是:只有连续、可被TensorRT优化且OP个数大于`min_subgraph_size`的子图,才会在加载TensorRT引擎时被优化。 + +- 方法二:找到这些子图的输入,按照上面的方式同样为它们设置动态尺寸,见下面的示例。
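下面给出方法二的一个最小C++示意(假设报错信息中提示名为`sub_graph_input_0`的输入未设置动态尺寸,该名称仅为假设,请以实际报错信息或`model.pdmodel`可视化结果为准),即在`deploy/cpp/src/object_detector.cc`已有的`SetTRTDynamicShapeInfo`调用中为该输入补充条目:

```
// 示意:为未设置动态尺寸的子图输入补充min/max/opt尺寸("sub_graph_input_0"为假设名称)
// 具体维度个数与取值需与该输入张量的实际形状一致
const std::map<std::string, std::vector<int>> map_min_input_shape = {
    {"image", {1, 3, trt_min_shape, trt_min_shape}},
    {"sub_graph_input_0", {1, 3, trt_min_shape, trt_min_shape}}};
const std::map<std::string, std::vector<int>> map_max_input_shape = {
    {"image", {1, 3, trt_max_shape, trt_max_shape}},
    {"sub_graph_input_0", {1, 3, trt_max_shape, trt_max_shape}}};
const std::map<std::string, std::vector<int>> map_opt_input_shape = {
    {"image", {1, 3, trt_opt_shape, trt_opt_shape}},
    {"sub_graph_input_0", {1, 3, trt_opt_shape, trt_opt_shape}}};
config.SetTRTDynamicShapeInfo(map_min_input_shape, map_max_input_shape, map_opt_input_shape);
```

**Q:** 如何打开日志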
+**A:** 预测引擎本身默认会输出日志,部署示例代码中通过`config.disable_glog_info()`(C++对应`config.DisableGlogInfo()`)关闭了日志,注释掉该行即可重新打开日志。 + +**Q:** 开启TensorRT,预测时提示Slice on batch axis is not supported in TensorRT
+**A:** 请尝试使用动态尺寸输入 diff --git a/deploy/cpp/CMakeLists.txt b/deploy/cpp/CMakeLists.txt new file mode 100644 index 0000000..0bc0be9 --- /dev/null +++ b/deploy/cpp/CMakeLists.txt @@ -0,0 +1,241 @@ +cmake_minimum_required(VERSION 3.0) +project(PaddleObjectDetector CXX C) + +option(WITH_MKL "Compile demo with MKL/OpenBlas support,defaultuseMKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." ON) +option(WITH_TENSORRT "Compile demo with TensorRT." OFF) + + +SET(PADDLE_DIR "" CACHE PATH "Location of libraries") +SET(PADDLE_LIB_NAME "" CACHE STRING "libpaddle_inference") +SET(OPENCV_DIR "" CACHE PATH "Location of libraries") +SET(CUDA_LIB "" CACHE PATH "Location of libraries") +SET(CUDNN_LIB "" CACHE PATH "Location of libraries") +SET(TENSORRT_INC_DIR "" CACHE PATH "Compile demo with TensorRT") +SET(TENSORRT_LIB_DIR "" CACHE PATH "Compile demo with TensorRT") + +include(cmake/yaml-cpp.cmake) + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/src/ext-yaml-cpp/include") +link_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/lib") + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if (WITH_MKL) + ADD_DEFINITIONS(-DUSE_MKL) +endif() + +if (NOT DEFINED PADDLE_DIR OR ${PADDLE_DIR} STREQUAL "") + message(FATAL_ERROR "please set PADDLE_DIR with -DPADDLE_DIR=/path/paddle_influence_dir") +endif() +message("PADDLE_DIR IS:"${PADDLE_DIR}) + +if (NOT DEFINED OPENCV_DIR OR ${OPENCV_DIR} STREQUAL "") + message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv") +endif() + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${PADDLE_DIR}/") +include_directories("${PADDLE_DIR}/third_party/install/protobuf/include") +include_directories("${PADDLE_DIR}/third_party/install/glog/include") +include_directories("${PADDLE_DIR}/third_party/install/gflags/include") +include_directories("${PADDLE_DIR}/third_party/install/xxhash/include") +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/include") + include_directories("${PADDLE_DIR}/third_party/install/snappy/include") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/include") + include_directories("${PADDLE_DIR}/third_party/install/snappystream/include") +endif() +include_directories("${PADDLE_DIR}/third_party/boost") +include_directories("${PADDLE_DIR}/third_party/eigen3") + +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappy/lib") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappystream/lib") +endif() + +link_directories("${PADDLE_DIR}/third_party/install/protobuf/lib") +link_directories("${PADDLE_DIR}/third_party/install/glog/lib") +link_directories("${PADDLE_DIR}/third_party/install/gflags/lib") +link_directories("${PADDLE_DIR}/third_party/install/xxhash/lib") +link_directories("${PADDLE_DIR}/paddle/lib/") +link_directories("${CMAKE_CURRENT_BINARY_DIR}") + + + +if (WIN32) + include_directories("${PADDLE_DIR}/paddle/fluid/inference") + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/fluid/inference") + find_package(OpenCV REQUIRED PATHS 
${OPENCV_DIR}/build/ NO_DEFAULT_PATH) + +else () + find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/share/OpenCV NO_DEFAULT_PATH) + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/lib") +endif () +include_directories(${OpenCV_INCLUDE_DIRS}) + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -o2 -fopenmp -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") +endif() + +# TODO let users define cuda lib path +if (WITH_GPU) + if (NOT DEFINED CUDA_LIB OR ${CUDA_LIB} STREQUAL "") + message(FATAL_ERROR "please set CUDA_LIB with -DCUDA_LIB=/path/cuda-8.0/lib64") + endif() + if (NOT WIN32) + if (NOT DEFINED CUDNN_LIB) + message(FATAL_ERROR "please set CUDNN_LIB with -DCUDNN_LIB=/path/cudnn_v7.4/cuda/lib64") + endif() + endif(NOT WIN32) +endif() + + +if (NOT WIN32) + if (WITH_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INC_DIR}/") + link_directories("${TENSORRT_LIB_DIR}/") + endif() +endif(NOT WIN32) + +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_DIR}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_MKL) + include_directories("${PADDLE_DIR}/third_party/install/mklml/include") + if (WIN32) + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.lib + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.lib) + else () + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + execute_process(COMMAND cp -r ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} /usr/lib) + endif () + set(MKLDNN_PATH "${PADDLE_DIR}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if (WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else () + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif () + endif() +else() + set(MATH_LIB ${PADDLE_DIR}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + + +if (WIN32) + if(EXISTS "${PADDLE_DIR}/paddle/fluid/inference/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(DEPS + ${PADDLE_DIR}/paddle/fluid/inference/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS + ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + + +if (WIN32) + set(DEPS ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +message("PADDLE_LIB_NAME:" ${PADDLE_LIB_NAME}) +message("DEPS:" $DEPS) + +if (NOT WIN32) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf z xxhash yaml-cpp + ) + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + 
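  # 注:snappy / snappystream 为可选依赖,仅当所下载的Paddle预测库third_party目录中存在对应lib时才会被链接(见上方 if(EXISTS ...) 判断)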
endif() +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash libyaml-cppmt) + set(DEPS ${DEPS} libcmt shlwapi) + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (WITH_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDNN_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-ldl -lrt -lgomp -lz -lm -lpthread") + set(DEPS ${DEPS} ${EXTERNAL_LIB}) +endif() + +set(DEPS ${DEPS} ${OpenCV_LIBS}) +add_executable(main src/main.cc src/preprocess_op.cc src/object_detector.cc) +ADD_DEPENDENCIES(main ext-yaml-cpp) +message("DEPS:" $DEPS) +target_link_libraries(main ${DEPS}) + +if (WIN32 AND WITH_MKL) + add_custom_command(TARGET main POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}.dll ./release/${PADDLE_LIB_NAME}.dll + ) +endif() + +if (WIN32) + add_custom_command(TARGET main POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/paddle/lib/${PADDLE_LIB_NAME}.dll ./release/${PADDLE_LIB_NAME}.dll + ) +endif() diff --git a/deploy/cpp/README.md b/deploy/cpp/README.md new file mode 100644 index 0000000..c9962bd --- /dev/null +++ b/deploy/cpp/README.md @@ -0,0 +1,72 @@ +# C++端预测部署 + +## 本教程结构 + +[1.说明](#1说明) + +[2.主要目录和文件](#2主要目录和文件) + +[3.编译部署](#3编译) + + + +## 1.说明 + +本目录为用户提供一个跨平台的`C++`部署方案,让用户通过`PaddleDetection`训练的模型导出后,即可基于本项目快速运行,也可以快速集成代码结合到自己的项目实际应用中去。 + +主要设计的目标包括以下四点: +- 跨平台,支持在 `Windows` 和 `Linux` 完成编译、二次开发集成和部署运行 +- 可扩展性,支持用户针对新模型开发自己特殊的数据预处理等逻辑 +- 高性能,除了`PaddlePaddle`自身带来的性能优势,我们还针对图像检测的特点对关键步骤进行了性能优化 +- 支持各种不同检测模型结构,包括`Yolov3`/`Faster_RCNN`/`SSD`等 + +## 2.主要目录和文件 + +```bash +deploy/cpp +| +├── src +│ ├── main.cc # 集成代码示例, 程序入口 +│ ├── object_detector.cc # 模型加载和预测主要逻辑封装类实现 +│ └── preprocess_op.cc # 预处理相关主要逻辑封装实现 +| +├── include +│ ├── config_parser.h # 导出模型配置yaml文件解析 +│ ├── object_detector.h # 模型加载和预测主要逻辑封装类 +│ └── preprocess_op.h # 预处理相关主要逻辑类封装 +| +├── docs +│ ├── linux_build.md # Linux 编译指南 +│ └── windows_vs2019_build.md # Windows VS2019编译指南 +│ +├── build.sh # 编译命令脚本 +│ +├── CMakeList.txt # cmake编译入口文件 +| +├── CMakeSettings.json # Visual Studio 2019 CMake项目编译设置 +│ 
+└── cmake # 依赖的外部项目cmake(目前仅有yaml-cpp) + +``` + +## 3.编译部署 + +### 3.1 导出模型 +请确认您已经基于`PaddleDetection`的[export_model.py](https://github.com/PaddlePaddle/PaddleDetection/blob/dygraph/tools/export_model.py)导出您的模型,并妥善保存到合适的位置。导出模型细节请参考 [导出模型教程](https://github.com/PaddlePaddle/PaddleDetection/tree/dygraph/deploy/EXPORT_MODEL.md)。 + +模型导出后, 目录结构如下(以`yolov3_darknet`为例): +``` +yolov3_darknet # 模型目录 +├── infer_cfg.yml # 模型配置信息 +├── model.pdmodel # 模型文件 +├── model.pdiparams.info #模型公用信息 +└── model.pdiparams # 参数文件 +``` + +预测时,该目录所在的路径会作为程序的输入参数。 + +### 3.2 编译 + +仅支持在`Windows`和`Linux`平台编译和使用 +- [Linux 编译指南](docs/linux_build.md) +- [Windows编译指南(使用Visual Studio 2019)](docs/windows_vs2019_build.md) diff --git a/deploy/cpp/cmake/yaml-cpp.cmake b/deploy/cpp/cmake/yaml-cpp.cmake new file mode 100644 index 0000000..7bc7f34 --- /dev/null +++ b/deploy/cpp/cmake/yaml-cpp.cmake @@ -0,0 +1,30 @@ + +find_package(Git REQUIRED) + +include(ExternalProject) + +message("${CMAKE_BUILD_TYPE}") + +ExternalProject_Add( + ext-yaml-cpp + URL https://bj.bcebos.com/paddlex/deploy/deps/yaml-cpp.zip + URL_MD5 9542d6de397d1fbd649ed468cb5850e6 + CMAKE_ARGS + -DYAML_CPP_BUILD_TESTS=OFF + -DYAML_CPP_BUILD_TOOLS=OFF + -DYAML_CPP_INSTALL=OFF + -DYAML_CPP_BUILD_CONTRIB=OFF + -DMSVC_SHARED_RT=OFF + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + PREFIX "${CMAKE_BINARY_DIR}/ext/yaml-cpp" + # Disable install step + INSTALL_COMMAND "" + LOG_DOWNLOAD ON + LOG_BUILD 1 +) diff --git a/deploy/cpp/docs/Jetson_build.md b/deploy/cpp/docs/Jetson_build.md new file mode 100644 index 0000000..d7ece30 --- /dev/null +++ b/deploy/cpp/docs/Jetson_build.md @@ -0,0 +1,188 @@ +# Jetson平台编译指南 + +## 说明 +`NVIDIA Jetson`设备是具有`NVIDIA GPU`的嵌入式设备,可以将目标检测算法部署到该设备上。本文档是在`Jetson`硬件上部署`PaddleDetection`模型的教程。 + +本文档以`Jetson TX2`硬件、`JetPack 4.3`版本为例进行说明。 + +`Jetson`平台的开发指南请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html). + +## Jetson环境搭建 +`Jetson`系统软件安装,请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html). + +* (1) 查看硬件系统的l4t的版本号 +``` +cat /etc/nv_tegra_release +``` +* (2) 根据硬件,选择硬件可安装的`JetPack`版本,硬件和`JetPack`版本对应关系请参考[jetpack-archive](https://developer.nvidia.com/embedded/jetpack-archive). 
+ +* (3) 下载`JetPack`,请参考[NVIDIA Jetson Linux Developer Guide](https://docs.nvidia.com/jetson/l4t/index.html) 中的`Preparing a Jetson Developer Kit for Use`章节内容进行刷写系统镜像。 + +**注意**: 请在[jetpack-archive](https://developer.nvidia.com/embedded/jetpack-archive) 根据硬件选择适配的`JetPack`版本进行刷机。 + +## 下载或编译`Paddle`预测库 +本文档使用`Paddle`在`JetPack4.3`上预先编译好的预测库,请根据硬件在[安装与编译 Linux 预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) 中选择对应版本的`Paddle`预测库。 + +这里选择[nv_jetson_cuda10_cudnn7.6_trt6(jetpack4.3)](https://paddle-inference-lib.bj.bcebos.com/2.0.0-nv-jetson-jetpack4.3-all/paddle_inference.tgz), `Paddle`版本`2.0.0-rc0`,`CUDA`版本`10.0`,`CUDNN`版本`7.6`,`TensorRT`版本`6`。 + +若需要自己在`Jetson`平台上自定义编译`Paddle`库,请参考文档[安装与编译 Linux 预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html) 的`NVIDIA Jetson嵌入式硬件预测库源码编译`部分内容。 + +### Step1: 下载代码 + + `git clone https://github.com/PaddlePaddle/PaddleDetection.git` + +**说明**:其中`C++`预测代码在`/root/projects/PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +解压下载的[nv_jetson_cuda10_cudnn7.6_trt6(jetpack4.3)](https://paddle-inference-lib.bj.bcebos.com/2.0.1-nv-jetson-jetpack4.3-all/paddle_inference.tgz) 。 + +下载并解压后`/root/projects/fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +**注意:** 预编译库`nv-jetson-cuda10-cudnn7.6-trt6`使用的`GCC`版本是`7.5.0`,其他都是使用`GCC 4.8.5`编译的。使用高版本的GCC可能存在`ABI`兼容性问题,建议降级或[自行编译预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。 + + +### Step4: 编译 + +编译`cmake`的命令在`scripts/build.sh`中,请根据实际情况修改主要参数,其主要内容说明如下: + +注意,`TX2`平台的`CUDA`、`CUDNN`需要通过`JetPack`安装。 + +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=ON + +# 是否使用MKL or openblas,TX2需要设置为OFF +WITH_MKL=OFF + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=ON + +# TensorRT 的include路径 +TENSORRT_INC_DIR=/usr/include/aarch64-linux-gnu + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/usr/lib/aarch64-linux-gnu + +# Paddle 预测库路径 +PADDLE_DIR=/path/to/fluid_inference/ + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# Paddle 的预测库是否使用静态库来编译 +# 使用TensorRT时,Paddle的预测库通常为动态库 +WITH_STATIC_LIB=OFF + +# CUDA 的 lib 路径 +CUDA_LIB=/usr/local/cuda-10.0/lib64 + +# CUDNN 的 lib 路径 +CUDNN_LIB=/usr/lib/aarch64-linux-gnu + +# OPENCV_DIR 的路径 +# linux平台请下载:https://bj.bcebos.com/paddleseg/deploy/opencv3.4.6gcc4.8ffmpeg.tar.gz2,并解压到deps文件夹下 +# TX2平台请下载:https://paddlemodels.bj.bcebos.com/TX2_JetPack4.3_opencv_3.4.10_gcc7.5.0.zip,并解压到deps文件夹下 +OPENCV_DIR=/path/to/opencv + +# 请检查以上各个路径是否正确 + +# 以下无需改动 +cmake .. 
\ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_MKL=OFF \ + -DWITH_TENSORRT=${WITH_TENSORRT} \ + -DTENSORRT_DIR=${TENSORRT_DIR} \ + -DPADDLE_DIR=${PADDLE_DIR} \ + -DWITH_STATIC_LIB=${WITH_STATIC_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DPADDLE_LIB_NAME={PADDLE_LIB_NAME} +make +``` + +例如设置如下: +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=ON + +# 是否使用MKL or openblas +WITH_MKL=OFF + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=OFF + +# TensorRT 的include路径 +TENSORRT_INC_DIR=/usr/include/aarch64-linux-gnu + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/usr/lib/aarch64-linux-gnu + +# Paddle 预测库路径 +PADDLE_DIR=/home/nvidia/PaddleDetection_infer/fluid_inference/ + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# Paddle 的预测库是否使用静态库来编译 +# 使用TensorRT时,Paddle的预测库通常为动态库 +WITH_STATIC_LIB=OFF + +# CUDA 的 lib 路径 +CUDA_LIB=/usr/local/cuda-10.0/lib64 + +# CUDNN 的 lib 路径 +CUDNN_LIB=/usr/lib/aarch64-linux-gnu/ +``` + +修改脚本设置好主要参数后,执行`build`脚本: + ```shell + sh ./scripts/build.sh + ``` + +### Step5: 预测及可视化 +编译成功后,预测入口程序为`build/main`其主要命令参数说明如下: +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的预测模型所在路径 | +| --image_path | 要预测的图片文件路径 | +| --video_path | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0)| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为fluid, 可选(fluid/trt_fp32/trt_fp16/trt_int8)| +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | + +**注意**: 如果同时设置了`video_path`和`image_path`,程序仅预测`video_path`。 + + +`样例一`: +```shell +#不使用`GPU`测试图片 `/root/projects/images/test.jpeg` +./main --model_dir=/root/projects/models/yolov3_darknet --image_path=/root/projects/images/test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用 `GPU`预测视频`/root/projects/videos/test.mp4` +./main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 +``` +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + + +## 性能测试 +benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/docs/linux_build.md b/deploy/cpp/docs/linux_build.md new file mode 100644 index 0000000..76b9619 --- /dev/null +++ b/deploy/cpp/docs/linux_build.md @@ -0,0 +1,129 @@ +# Linux平台编译指南 + +## 说明 +本文档在 `Linux`平台使用`GCC 8.2`测试过,如果需要使用其他G++版本编译使用,则需要重新编译Paddle预测库,请参考: [从源码编译Paddle预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。本文档使用的预置的opencv库是在ubuntu 16.04上用gcc4.8编译的,如果需要在ubuntu 16.04以外的系统环境编译,那么需自行编译opencv库。 + +## 前置条件 +* G++ 8.2 +* CUDA 9.0 / CUDA 10.0, cudnn 7+ (仅在使用GPU版本的预测库时需要) +* CMake 3.0+ + +请确保系统已经安装好上述基本软件,**下面所有示例以工作目录为 `/root/projects/`演示**。 + +### Step1: 下载代码 + + `git clone https://github.com/PaddlePaddle/PaddleDetection.git` + +**说明**:其中`C++`预测代码在`/root/projects/PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: [C++预测库下载列表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html) + + +下载并解压后`/root/projects/fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +**注意:** 预编译版本除`nv-jetson-cuda10-cudnn7.5-trt5` 以外其它包都是基于`GCC 4.8.5`编译,使用高版本`GCC`可能存在 
`ABI`兼容性问题,建议降级或[自行编译预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.html)。 + + +### Step4: 编译 + +编译`cmake`的命令在`scripts/build.sh`中,请根据实际情况修改主要参数,其主要内容说明如下: + +``` +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=OFF + +# 使用MKL or openblas +WITH_MKL=ON + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=OFF + +# TensorRT 的include路径 +TENSORRT_LIB_DIR=/path/to/TensorRT/include + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/path/to/TensorRT/lib + +# Paddle 预测库路径 +PADDLE_DIR=/path/to/fluid_inference + +# Paddle 预测库名称 +PADDLE_LIB_NAME=paddle_inference + +# CUDA 的 lib 路径 +CUDA_LIB=/path/to/cuda/lib + +# CUDNN 的 lib 路径 +CUDNN_LIB=/path/to/cudnn/lib + +# 请检查以上各个路径是否正确 + +# 以下无需改动 +cmake .. \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_MKL=${WITH_MKL} \ + -DWITH_TENSORRT=${WITH_TENSORRT} \ + -DTENSORRT_LIB_DIR=${TENSORRT_LIB_DIR} \ + -DTENSORRT_INC_DIR=${TENSORRT_INC_DIR} \ + -DPADDLE_DIR=${PADDLE_DIR} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DPADDLE_LIB_NAME={PADDLE_LIB_NAME} +make + +``` + +修改脚本设置好主要参数后,执行`build`脚本: + ```shell + sh ./scripts/build.sh + ``` + +**注意**: OPENCV依赖OPENBLAS,Ubuntu用户需确认系统是否已存在`libopenblas.so`。如未安装,可执行apt-get install libopenblas-dev进行安装。 + +### Step5: 预测及可视化 +编译成功后,预测入口程序为`build/main`其主要命令参数说明如下: +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的预测模型所在路径 | +| --image_path | 要预测的图片文件路径 | +| --video_path | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0)| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为fluid, 可选(fluid/trt_fp32/trt_fp16/trt_int8)| +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | + +**注意**: 如果同时设置了`video_path`和`image_path`,程序仅预测`video_path`。 + + +`样例一`: +```shell +#不使用`GPU`测试图片 `/root/projects/images/test.jpeg` +./build/main --model_dir=/root/projects/models/yolov3_darknet --image_path=/root/projects/images/test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用 `GPU`预测视频`/root/projects/videos/test.mp4` +./build/main --model_dir=/root/projects/models/yolov3_darknet --video_path=/root/projects/images/test.mp4 --use_gpu=1 +``` +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + +## 性能测试 +benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/docs/windows_vs2019_build.md b/deploy/cpp/docs/windows_vs2019_build.md new file mode 100644 index 0000000..34607b2 --- /dev/null +++ b/deploy/cpp/docs/windows_vs2019_build.md @@ -0,0 +1,128 @@ +# Visual Studio 2019 Community CMake 编译指南 + +Windows 平台下,我们使用`Visual Studio 2019 Community` 进行了测试。微软从`Visual Studio 2017`开始即支持直接管理`CMake`跨平台编译项目,但是直到`2019`才提供了稳定和完全的支持,所以如果你想使用CMake管理项目编译构建,我们推荐你使用`Visual Studio 2019`环境下构建。 + + +## 前置条件 +* Visual Studio 2019 (根据Paddle预测库所使用的VS版本选择,请参考 [Visual Studio 不同版本二进制兼容性](https://docs.microsoft.com/zh-cn/cpp/porting/binary-compat-2015-2017?view=vs-2019) ) +* CUDA 9.0 / CUDA 10.0,cudnn 7+ (仅在使用GPU版本的预测库时需要) +* CMake 3.0+ [CMake下载](https://cmake.org/download/) + +请确保系统已经安装好上述基本软件,我们使用的是`VS2019`的社区版。 + +**下面所有示例以工作目录为 `D:\projects`演示**。 + +### Step1: 下载代码 + +下载源代码 +```shell +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` + +**说明**:其中`C++`预测代码在`PaddleDetection/deploy/cpp` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: 
[C++预测库下载列表](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/windows_cpp_inference.html) + +解压后`D:\projects\fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +### Step3: 安装配置OpenCV + +1. 在OpenCV官网下载适用于Windows平台的3.4.6版本, [下载地址](https://sourceforge.net/projects/opencvlibrary/files/3.4.6/opencv-3.4.6-vc14_vc15.exe/download) +2. 运行下载的可执行文件,将OpenCV解压至指定目录,如`D:\projects\opencv` +3. 配置环境变量,如下流程所示(如果使用全局绝对路径,可以不用设置环境变量) + - 我的电脑->属性->高级系统设置->环境变量 + - 在系统变量中找到Path(如没有,自行创建),并双击编辑 + - 新建,将opencv路径填入并保存,如`D:\projects\opencv\build\x64\vc14\bin` + +### Step4: 编译 + +1. 进入到`cpp`文件夹 +``` +cd D:\projects\PaddleDetection\deploy\cpp +``` + +2. 使用CMake生成项目文件 + +编译参数的含义说明如下(带*表示仅在使用**GPU版本**预测库时指定, 其中CUDA库版本尽量对齐,**使用9.0、10.0版本,不使用9.2、10.1等版本CUDA库**): + +| 参数名 | 含义 | +| ---- | ---- | +| *CUDA_LIB | CUDA的库路径 | +| *CUDNN_LIB | CUDNN的库路径 | +| OPENCV_DIR | OpenCV的安装路径, | +| PADDLE_DIR | Paddle预测库的路径 | +| PADDLE_LIB_NAME | Paddle 预测库名称 | + +**注意:** 1. 使用`CPU`版预测库,请把`WITH_GPU`的勾去掉 2. 如果使用的是`openblas`版本,请把`WITH_MKL`勾去掉 + +执行如下命令项目文件: +``` +cmake . -G "Visual Studio 16 2019" -A x64 -T host=x64 -DWITH_GPU=ON -DWITH_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCUDA_LIB=path_to_cuda_lib -DCUDNN_LIB=path_to_cudnn_lib -DPADDLE_DIR=path_to_paddle_lib -DPADDLE_LIB_NAME=paddle_inference -DOPENCV_DIR=path_to_opencv +``` + +例如: +``` +cmake . -G "Visual Studio 16 2019" -A x64 -T host=x64 -DWITH_GPU=ON -DWITH_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCUDA_LIB=D:\projects\packages\cuda10_0\lib\x64 -DCUDNN_LIB=D:\projects\packages\cuda10_0\lib\x64 -DPADDLE_DIR=D:\projects\packages\fluid_inference -DPADDLE_LIB_NAME=paddle_inference -DOPENCV_DIR=D:\projects\packages\opencv3_4_6 +``` + +3. 编译 +用`Visual Studio 16 2019`打开`cpp`文件夹下的`PaddleObjectDetector.sln`,将编译模式设置为`Release`,点击`生成`->`全部生成 + + +### Step5: 预测及可视化 + +上述`Visual Studio 2019`编译产出的可执行文件在`out\build\x64-Release`目录下,打开`cmd`,并切换到该目录: + +``` +cd D:\projects\PaddleDetection\deploy\cpp\out\build\x64-Release +``` +可执行文件`main`即为样例的预测程序,其主要的命令行参数如下: + +| 参数 | 说明 | +| ---- | ---- | +| --model_dir | 导出的预测模型所在路径 | +| --image_path | 要预测的图片文件路径 | +| --video_path | 要预测的视频文件路径 | +| --camera_id | Option | 用来预测的摄像头ID,默认为-1(表示不使用摄像头预测)| +| --use_gpu | 是否使用 GPU 预测, 支持值为0或1(默认值为0)| +| --gpu_id | 指定进行推理的GPU device id(默认值为0)| +| --run_mode | 使用GPU时,默认为fluid, 可选(fluid/trt_fp32/trt_fp16/trt_int8)| +| --run_benchmark | 是否重复预测来进行benchmark测速 | +| --output_dir | 输出图片所在的文件夹, 默认为output | + +**注意**: +(1)如果同时设置了`video_path`和`image_path`,程序仅预测`video_path`。 +(2)如果提示找不到`opencv_world346.dll`,把`D:\projects\packages\opencv3_4_6\build\x64\vc14\bin`文件夹下的`opencv_world346.dll`拷贝到`main.exe`文件夹下即可。 + + +`样例一`: +```shell +#不使用`GPU`测试图片 `D:\\images\\test.jpeg` +.\main --model_dir=D:\\models\\yolov3_darknet --image_path=D:\\images\\test.jpeg +``` + +图片文件`可视化预测结果`会保存在当前目录下`output.jpg`文件中。 + + +`样例二`: +```shell +#使用`GPU`测试视频 `D:\\videos\\test.mp4` +.\main --model_dir=D:\\models\\yolov3_darknet --video_path=D:\\videos\\test.mp4 --use_gpu=1 +``` + +视频文件目前支持`.mp4`格式的预测,`可视化预测结果`会保存在当前目录下`output.mp4`文件中。 + + +## 性能测试 +benchmark请查看[BENCHMARK_INFER](../../BENCHMARK_INFER.md) diff --git a/deploy/cpp/include/config_parser.h b/deploy/cpp/include/config_parser.h new file mode 100644 index 0000000..c38049d --- /dev/null +++ b/deploy/cpp/include/config_parser.h @@ -0,0 +1,113 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "yaml-cpp/yaml.h" + +#ifdef _WIN32 +#define OS_PATH_SEP "\\" +#else +#define OS_PATH_SEP "/" +#endif + +namespace PaddleDetection { + +// Inference model configuration parser +class ConfigPaser { + public: + ConfigPaser() {} + + ~ConfigPaser() {} + + bool load_config(const std::string& model_dir, + const std::string& cfg = "infer_cfg.yml") { + // Load as a YAML::Node + YAML::Node config; + config = YAML::LoadFile(model_dir + OS_PATH_SEP + cfg); + + // Get runtime mode : fluid, trt_fp16, trt_fp32 + if (config["mode"].IsDefined()) { + mode_ = config["mode"].as(); + } else { + std::cerr << "Please set mode, " + << "support value : fluid/trt_fp16/trt_fp32." + << std::endl; + return false; + } + + // Get model arch : YOLO, SSD, RetinaNet, RCNN, Face + if (config["arch"].IsDefined()) { + arch_ = config["arch"].as(); + } else { + std::cerr << "Please set model arch," + << "support value : YOLO, SSD, RetinaNet, RCNN, Face." + << std::endl; + return false; + } + + // Get min_subgraph_size for tensorrt + if (config["min_subgraph_size"].IsDefined()) { + min_subgraph_size_ = config["min_subgraph_size"].as(); + } else { + std::cerr << "Please set min_subgraph_size." << std::endl; + return false; + } + // Get draw_threshold for visualization + if (config["draw_threshold"].IsDefined()) { + draw_threshold_ = config["draw_threshold"].as(); + } else { + std::cerr << "Please set draw_threshold." << std::endl; + return false; + } + // Get Preprocess for preprocessing + if (config["Preprocess"].IsDefined()) { + preprocess_info_ = config["Preprocess"]; + } else { + std::cerr << "Please set Preprocess." << std::endl; + return false; + } + // Get label_list for visualization + if (config["label_list"].IsDefined()) { + label_list_ = config["label_list"].as>(); + } else { + std::cerr << "Please set label_list." << std::endl; + return false; + } + + if (config["image_shape"].IsDefined()) { + image_shape_ = config["image_shape"].as>(); + } else { + std::cerr << "Please set image_shape." << std::endl; + return false; + } + + return true; + } + std::string mode_; + float draw_threshold_; + std::string arch_; + int min_subgraph_size_; + YAML::Node preprocess_info_; + std::vector label_list_; + std::vector image_shape_; +}; + +} // namespace PaddleDetection + diff --git a/deploy/cpp/include/object_detector.h b/deploy/cpp/include/object_detector.h new file mode 100644 index 0000000..4c1846a --- /dev/null +++ b/deploy/cpp/include/object_detector.h @@ -0,0 +1,118 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "paddle_inference_api.h" // NOLINT + +#include "include/preprocess_op.h" +#include "include/config_parser.h" + +using namespace paddle_infer; + +namespace PaddleDetection { +// Object Detection Result +struct ObjectResult { + // Rectangle coordinates of detected object: left, right, top, down + std::vector rect; + // Class id of detected object + int class_id; + // Confidence of detected object + float confidence; +}; + + +// Generate visualization colormap for each class +std::vector GenerateColorMap(int num_class); + + +// Visualiztion Detection Result +cv::Mat VisualizeResult(const cv::Mat& img, + const std::vector& results, + const std::vector& lable_list, + const std::vector& colormap); + + +class ObjectDetector { + public: + explicit ObjectDetector(const std::string& model_dir, + bool use_gpu=false, + const std::string& run_mode="fluid", + const int gpu_id=0, + bool use_dynamic_shape=false, + const int trt_min_shape=1, + const int trt_max_shape=1280, + const int trt_opt_shape=640) { + config_.load_config(model_dir); + threshold_ = config_.draw_threshold_; + image_shape_ = config_.image_shape_; + preprocessor_.Init(config_.preprocess_info_, image_shape_); + LoadModel(model_dir, use_gpu, config_.min_subgraph_size_, 1, run_mode, gpu_id, + use_dynamic_shape, trt_min_shape, trt_max_shape, trt_opt_shape); + } + + // Load Paddle inference model + void LoadModel( + const std::string& model_dir, + bool use_gpu, + const int min_subgraph_size, + const int batch_size = 1, + const std::string& run_mode = "fluid", + const int gpu_id=0, + bool use_dynamic_shape=false, + const int trt_min_shape=1, + const int trt_max_shape=1280, + const int trt_opt_shape=640); + + // Run predictor + void Predict(const cv::Mat& im, + const double threshold = 0.5, + const int warmup = 0, + const int repeats = 1, + const bool run_benchmark = false, + std::vector* result = nullptr); + + // Get Model Label list + const std::vector& GetLabelList() const { + return config_.label_list_; + } + + private: + // Preprocess image and copy data to input buffer + void Preprocess(const cv::Mat& image_mat); + // Postprocess result + void Postprocess( + const cv::Mat& raw_mat, + std::vector* result); + + std::shared_ptr predictor_; + Preprocessor preprocessor_; + ImageBlob inputs_; + std::vector output_data_; + float threshold_; + ConfigPaser config_; + std::vector image_shape_; +}; + +} // namespace PaddleDetection diff --git a/deploy/cpp/include/preprocess_op.h b/deploy/cpp/include/preprocess_op.h new file mode 100644 index 0000000..26a91cc --- /dev/null +++ b/deploy/cpp/include/preprocess_op.h @@ -0,0 +1,160 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace PaddleDetection { + +// Object for storing all preprocessed data +class ImageBlob { + public: + // image width and height + std::vector im_shape_; + // Buffer for image data after preprocessing + std::vector im_data_; + // in net data shape(after pad) + std::vector in_net_shape_; + // Evaluation image width and height + //std::vector eval_im_size_f_; + // Scale factor for image size to origin image size + std::vector scale_factor_; +}; + +// Abstraction of preprocessing opration class +class PreprocessOp { + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) = 0; + virtual void Run(cv::Mat* im, ImageBlob* data) = 0; +}; + +class InitInfo : public PreprocessOp{ + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) {} + virtual void Run(cv::Mat* im, ImageBlob* data); +}; + +class NormalizeImage : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) { + mean_ = item["mean"].as>(); + scale_ = item["std"].as>(); + is_scale_ = item["is_scale"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + // CHW or HWC + std::vector mean_; + std::vector scale_; + bool is_scale_; +}; + +class Permute : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) {} + virtual void Run(cv::Mat* im, ImageBlob* data); + +}; + +class Resize : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) { + interp_ = item["interp"].as(); + //max_size_ = item["target_size"].as(); + keep_ratio_ = item["keep_ratio"].as(); + target_size_ = item["target_size"].as>(); + if (item["keep_ratio"]) { + in_net_shape_ = image_shape; + } + } + + // Compute best resize scale for x-dimension, y-dimension + std::pair GenerateScale(const cv::Mat& im); + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int interp_; + bool keep_ratio_; + std::vector target_size_; + std::vector in_net_shape_; +}; + +// Models with FPN need input shape % stride == 0 +class PadStride : public PreprocessOp { + public: + virtual void Init(const YAML::Node& item, const std::vector image_shape) { + stride_ = item["stride"].as(); + } + + virtual void Run(cv::Mat* im, ImageBlob* data); + + private: + int stride_; +}; + +class Preprocessor { + public: + void Init(const YAML::Node& config_node, const std::vector image_shape) { + // initialize image info at first + ops_["InitInfo"] = std::make_shared(); + for (const auto& item : config_node) { + auto op_name = item["type"].as(); + + ops_[op_name] = CreateOp(op_name); + ops_[op_name]->Init(item, image_shape); + } + } + + std::shared_ptr CreateOp(const std::string& name) { + if (name == "Resize") { + return std::make_shared(); + } else if (name == "Permute") { + return std::make_shared(); + } else if (name == "NormalizeImage") { + return std::make_shared(); + } else if (name == "PadStride") { + // use 
PadStride instead of PadBatch + return std::make_shared(); + } + std::cerr << "can not find function of OP: " << name << " and return: nullptr" << std::endl; + return nullptr; + } + + void Run(cv::Mat* im, ImageBlob* data); + + public: + static const std::vector RUN_ORDER; + + private: + std::unordered_map> ops_; +}; + +} // namespace PaddleDetection + diff --git a/deploy/cpp/scripts/build.sh b/deploy/cpp/scripts/build.sh new file mode 100644 index 0000000..ed901d0 --- /dev/null +++ b/deploy/cpp/scripts/build.sh @@ -0,0 +1,79 @@ +# 是否使用GPU(即是否使用 CUDA) +WITH_GPU=OFF + +# 是否使用MKL or openblas,TX2需要设置为OFF +WITH_MKL=ON + +# 是否集成 TensorRT(仅WITH_GPU=ON 有效) +WITH_TENSORRT=OFF + +# paddle 预测库lib名称,由于不同平台不同版本预测库lib名称不同,请查看所下载的预测库中`paddle_inference/lib/`文件夹下`lib`的名称 +PADDLE_LIB_NAME=libpaddle_inference + +# TensorRT 的include路径 +TENSORRT_INC_DIR=/path/to/tensorrt/include + +# TensorRT 的lib路径 +TENSORRT_LIB_DIR=/path/to/tensorrt/lib + +# Paddle 预测库路径 +PADDLE_DIR=/path/to/paddle_inference + +# CUDA 的 lib 路径 +CUDA_LIB=/path/to/cuda/lib + +# CUDNN 的 lib 路径 +CUDNN_LIB=/path/to/cudnn/lib + + +MACHINE_TYPE=`uname -m` +echo "MACHINE_TYPE: "${MACHINE_TYPE} + + +if [ "$MACHINE_TYPE" = "x86_64" ] +then + echo "set OPENCV_DIR for x86_64" + # linux系统通过以下命令下载预编译的opencv + mkdir -p $(pwd)/deps && cd $(pwd)/deps + wget -c https://paddledet.bj.bcebos.com/data/opencv3.4.6gcc8.2ffmpeg.zip + unzip opencv3.4.6gcc8.2ffmpeg.zip && cd .. + + # set OPENCV_DIR + OPENCV_DIR=$(pwd)/deps/opencv3.4.6gcc8.2ffmpeg + +elif [ "$MACHINE_TYPE" = "aarch64" ] +then + echo "set OPENCV_DIR for aarch64" + # TX2平台通过以下命令下载预编译的opencv + mkdir -p $(pwd)/deps && cd $(pwd)/deps + wget -c https://paddlemodels.bj.bcebos.com/TX2_JetPack4.3_opencv_3.4.10_gcc7.5.0.zip + unzip TX2_JetPack4.3_opencv_3.4.10_gcc7.5.0.zip && cd .. + + # set OPENCV_DIR + OPENCV_DIR=$(pwd)/deps/TX2_JetPack4.3_opencv_3.4.10_gcc7.5.0/ + +else + echo "Please set OPENCV_DIR manually" +fi + +echo "OPENCV_DIR: "$OPENCV_DIR + +# 以下无需改动 +rm -rf build +mkdir -p build +cd build +cmake .. \ + -DWITH_GPU=${WITH_GPU} \ + -DWITH_MKL=${WITH_MKL} \ + -DWITH_TENSORRT=${WITH_TENSORRT} \ + -DTENSORRT_LIB_DIR=${TENSORRT_LIB_DIR} \ + -DTENSORRT_INC_DIR=${TENSORRT_INC_DIR} \ + -DPADDLE_DIR=${PADDLE_DIR} \ + -DWITH_STATIC_LIB=${WITH_STATIC_LIB} \ + -DCUDA_LIB=${CUDA_LIB} \ + -DCUDNN_LIB=${CUDNN_LIB} \ + -DOPENCV_DIR=${OPENCV_DIR} \ + -DPADDLE_LIB_NAME=${PADDLE_LIB_NAME} + +make +echo "make finished!" diff --git a/deploy/cpp/src/main.cc b/deploy/cpp/src/main.cc new file mode 100644 index 0000000..d221165 --- /dev/null +++ b/deploy/cpp/src/main.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#elif LINUX +#include +#include +#endif + +#include "include/object_detector.h" + + +DEFINE_string(model_dir, "", "Path of inference model"); +DEFINE_string(image_path, "", "Path of input image"); +DEFINE_string(video_path, "", "Path of input video"); +DEFINE_bool(use_gpu, false, "Infering with GPU or CPU"); +DEFINE_bool(use_camera, false, "Use camera or not"); +DEFINE_string(run_mode, "fluid", "Mode of running(fluid/trt_fp32/trt_fp16/trt_int8)"); +DEFINE_int32(gpu_id, 0, "Device id of GPU to execute"); +DEFINE_int32(camera_id, -1, "Device id of camera to predict"); +DEFINE_bool(run_benchmark, false, "Whether to predict a image_file repeatedly for benchmark"); +DEFINE_double(threshold, 0.5, "Threshold of score."); +DEFINE_string(output_dir, "output", "Directory of output visualization files."); +DEFINE_bool(use_dynamic_shape, false, "Trt use dynamic shape or not"); +DEFINE_int32(trt_min_shape, 1, "Min shape of TRT DynamicShapeI"); +DEFINE_int32(trt_max_shape, 1280, "Max shape of TRT DynamicShapeI"); +DEFINE_int32(trt_opt_shape, 640, "Opt shape of TRT DynamicShapeI"); + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(OS_PATH_SEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static bool PathExists(const std::string& path){ +#ifdef _WIN32 + struct _stat buffer; + return (_stat(path.c_str(), &buffer) == 0); +#else + struct stat buffer; + return (stat(path.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static void MkDir(const std::string& path) { + if (PathExists(path)) return; + int ret = 0; +#ifdef _WIN32 + ret = _mkdir(path.c_str()); +#else + ret = mkdir(path.c_str(), 0755); +#endif // !_WIN32 + if (ret != 0) { + std::string path_error(path); + path_error += " mkdir failed!"; + throw std::runtime_error(path_error); + } +} + +static void MkDirs(const std::string& path) { + if (path.empty()) return; + if (PathExists(path)) return; + + MkDirs(DirName(path)); + MkDir(path); +} + +void PredictVideo(const std::string& video_path, + PaddleDetection::ObjectDetector* det) { + // Open video + cv::VideoCapture capture; + if (FLAGS_camera_id != -1){ + capture.open(FLAGS_camera_id); + }else{ + capture.open(video_path.c_str()); + } + if (!capture.isOpened()) { + printf("can not open video : %s\n", video_path.c_str()); + return; + } + + // Get Video info : resolution, fps + int video_width = static_cast(capture.get(CV_CAP_PROP_FRAME_WIDTH)); + int video_height = static_cast(capture.get(CV_CAP_PROP_FRAME_HEIGHT)); + int video_fps = static_cast(capture.get(CV_CAP_PROP_FPS)); + + // Create VideoWriter for output + cv::VideoWriter video_out; + std::string video_out_path = "output.mp4"; + video_out.open(video_out_path.c_str(), + 0x00000021, + video_fps, + cv::Size(video_width, video_height), + true); + if (!video_out.isOpened()) { + printf("create video writer failed!\n"); + return; + } + + std::vector result; + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + // Capture all frames and do inference + cv::Mat frame; + int frame_id = 0; + while (capture.read(frame)) { + if (frame.empty()) { + break; + } + det->Predict(frame, 0.5, 0, 1, false, &result); + cv::Mat out_im = PaddleDetection::VisualizeResult( + frame, result, labels, colormap); + for (const auto& item : result) { + printf("In frame id %d, we detect: class=%d confidence=%.2f rect=[%d %d %d %d]\n", + frame_id, 
+ item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + video_out.write(out_im); + frame_id += 1; + } + capture.release(); + video_out.release(); +} + +void PredictImage(const std::string& image_path, + const double threshold, + const bool run_benchmark, + PaddleDetection::ObjectDetector* det, + const std::string& output_dir = "output") { + // Open input image as an opencv cv::Mat object + cv::Mat im = cv::imread(image_path, 1); + // Store all detected result + std::vector result; + if (run_benchmark) + { + det->Predict(im, threshold, 100, 100, run_benchmark, &result); + }else + { + det->Predict(im, 0.5, 0, 1, run_benchmark, &result); + for (const auto& item : result) { + printf("class=%d confidence=%.4f rect=[%d %d %d %d]\n", + item.class_id, + item.confidence, + item.rect[0], + item.rect[1], + item.rect[2], + item.rect[3]); + } + // Visualization result + auto labels = det->GetLabelList(); + auto colormap = PaddleDetection::GenerateColorMap(labels.size()); + cv::Mat vis_img = PaddleDetection::VisualizeResult( + im, result, labels, colormap); + std::vector compression_params; + compression_params.push_back(CV_IMWRITE_JPEG_QUALITY); + compression_params.push_back(95); + std::string output_path(output_dir); + if (output_dir.rfind(OS_PATH_SEP) != output_dir.size() - 1) { + output_path += OS_PATH_SEP; + } + output_path += "output.jpg"; + cv::imwrite(output_path, vis_img, compression_params); + printf("Visualized output saved as %s\n", output_path.c_str()); + } +} + +int main(int argc, char** argv) { + // Parsing command-line + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir.empty() + || (FLAGS_image_path.empty() && FLAGS_video_path.empty())) { + std::cout << "Usage: ./main --model_dir=/PATH/TO/INFERENCE_MODEL/ " + << "--image_path=/PATH/TO/INPUT/IMAGE/" << std::endl; + return -1; + } + if (!(FLAGS_run_mode == "fluid" || FLAGS_run_mode == "trt_fp32" + || FLAGS_run_mode == "trt_fp16" || FLAGS_run_mode == "trt_int8")) { + std::cout << "run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'."; + return -1; + } + // Load model and create a object detector + PaddleDetection::ObjectDetector det(FLAGS_model_dir, FLAGS_use_gpu, FLAGS_run_mode, + FLAGS_gpu_id, FLAGS_use_dynamic_shape, FLAGS_trt_min_shape, + FLAGS_trt_max_shape, FLAGS_trt_opt_shape); + // Do inference on input video or image + if (!FLAGS_video_path.empty() || FLAGS_use_camera) { + PredictVideo(FLAGS_video_path, &det); + } else if (!FLAGS_image_path.empty()) { + if (!PathExists(FLAGS_output_dir)) { + MkDirs(FLAGS_output_dir); + } + PredictImage(FLAGS_image_path, FLAGS_threshold, FLAGS_run_benchmark, &det, FLAGS_output_dir); + } + return 0; +} diff --git a/deploy/cpp/src/object_detector.cc b/deploy/cpp/src/object_detector.cc new file mode 100644 index 0000000..95b8dbb --- /dev/null +++ b/deploy/cpp/src/object_detector.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#include +// for setprecision +#include +#include +#include "include/object_detector.h" + + +using namespace paddle_infer; + +namespace PaddleDetection { + +// Load Model and create model predictor +void ObjectDetector::LoadModel(const std::string& model_dir, + bool use_gpu, + const int min_subgraph_size, + const int batch_size, + const std::string& run_mode, + const int gpu_id, + bool use_dynamic_shape, + const int trt_min_shape, + const int trt_max_shape, + const int trt_opt_shape) { + paddle_infer::Config config; + std::string prog_file = model_dir + OS_PATH_SEP + "model.pdmodel"; + std::string params_file = model_dir + OS_PATH_SEP + "model.pdiparams"; + config.SetModel(prog_file, params_file); + if (use_gpu) { + config.EnableUseGpu(200, gpu_id); + config.SwitchIrOptim(true); + // use tensorrt + bool use_calib_mode = false; + if (run_mode != "fluid") { + auto precision = paddle_infer::Config::Precision::kFloat32; + if (run_mode == "trt_fp32") { + precision = paddle_infer::Config::Precision::kFloat32; + } + else if (run_mode == "trt_fp16") { + precision = paddle_infer::Config::Precision::kHalf; + } + else if (run_mode == "trt_int8") { + precision = paddle_infer::Config::Precision::kInt8; + use_calib_mode = true; + } else { + printf("run_mode should be 'fluid', 'trt_fp32', 'trt_fp16' or 'trt_int8'"); + } + // set tensorrt + config.EnableTensorRtEngine( + 1 << 30, + batch_size, + min_subgraph_size, + precision, + false, + use_calib_mode); + + // set use dynamic shape + if (use_dynamic_shape) { + // set DynamicShsape for image tensor + const std::vector min_input_shape = {1, 3, trt_min_shape, trt_min_shape}; + const std::vector max_input_shape = {1, 3, trt_max_shape, trt_max_shape}; + const std::vector opt_input_shape = {1, 3, trt_opt_shape, trt_opt_shape}; + const std::map> map_min_input_shape = {{"image", min_input_shape}}; + const std::map> map_max_input_shape = {{"image", max_input_shape}}; + const std::map> map_opt_input_shape = {{"image", opt_input_shape}}; + + config.SetTRTDynamicShapeInfo(map_min_input_shape, + map_max_input_shape, + map_opt_input_shape); + std::cout << "TensorRT dynamic shape enabled" << std::endl; + } + } + + } else { + config.DisableGpu(); + } + config.SwitchUseFeedFetchOps(false); + config.DisableGlogInfo(); + // Memory optimization + config.EnableMemoryOptim(); + predictor_ = std::move(CreatePredictor(config)); +} + +// Visualiztion MaskDetector results +cv::Mat VisualizeResult(const cv::Mat& img, + const std::vector& results, + const std::vector& lable_list, + const std::vector& colormap) { + cv::Mat vis_img = img.clone(); + for (int i = 0; i < results.size(); ++i) { + int w = results[i].rect[1] - results[i].rect[0]; + int h = results[i].rect[3] - results[i].rect[2]; + cv::Rect roi = cv::Rect(results[i].rect[0], results[i].rect[2], w, h); + + // Configure color and text size + std::ostringstream oss; + oss << std::setiosflags(std::ios::fixed) << std::setprecision(4); + oss << lable_list[results[i].class_id] << " "; + oss << results[i].confidence; + std::string text = oss.str(); + int c1 = colormap[3 * results[i].class_id + 0]; + int c2 = colormap[3 * results[i].class_id + 1]; + int c3 = colormap[3 * results[i].class_id + 2]; + cv::Scalar roi_color = cv::Scalar(c1, c2, c3); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 0.5f; + float thickness = 0.5; + cv::Size text_size = cv::getTextSize(text, + font_face, + font_scale, + 
thickness, + nullptr); + cv::Point origin; + origin.x = roi.x; + origin.y = roi.y; + + // Configure text background + cv::Rect text_back = cv::Rect(results[i].rect[0], + results[i].rect[2] - text_size.height, + text_size.width, + text_size.height); + + // Draw roi object, text, and background + cv::rectangle(vis_img, roi, roi_color, 2); + cv::rectangle(vis_img, text_back, roi_color, -1); + cv::putText(vis_img, + text, + origin, + font_face, + font_scale, + cv::Scalar(255, 255, 255), + thickness); + } + return vis_img; +} + +void ObjectDetector::Preprocess(const cv::Mat& ori_im) { + // Clone the image : keep the original mat for postprocess + cv::Mat im = ori_im.clone(); + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + preprocessor_.Run(&im, &inputs_); +} + +void ObjectDetector::Postprocess( + const cv::Mat& raw_mat, + std::vector* result) { + result->clear(); + int rh = 1; + int rw = 1; + if (config_.arch_ == "Face") { + rh = raw_mat.rows; + rw = raw_mat.cols; + } + + int total_size = output_data_.size() / 6; + for (int j = 0; j < total_size; ++j) { + // Class id + int class_id = static_cast(round(output_data_[0 + j * 6])); + // Confidence score + float score = output_data_[1 + j * 6]; + int xmin = (output_data_[2 + j * 6] * rw); + int ymin = (output_data_[3 + j * 6] * rh); + int xmax = (output_data_[4 + j * 6] * rw); + int ymax = (output_data_[5 + j * 6] * rh); + int wd = xmax - xmin; + int hd = ymax - ymin; + if (score > threshold_ && class_id > -1) { + ObjectResult result_item; + result_item.rect = {xmin, xmax, ymin, ymax}; + result_item.class_id = class_id; + result_item.confidence = score; + result->push_back(result_item); + } + } +} + +void ObjectDetector::Predict(const cv::Mat& im, + const double threshold, + const int warmup, + const int repeats, + const bool run_benchmark, + std::vector* result) { + // Preprocess image + Preprocess(im); + // Prepare input tensor + auto input_names = predictor_->GetInputNames(); + for (const auto& tensor_name : input_names) { + auto in_tensor = predictor_->GetInputHandle(tensor_name); + if (tensor_name == "image") { + int rh = inputs_.in_net_shape_[0]; + int rw = inputs_.in_net_shape_[1]; + in_tensor->Reshape({1, 3, rh, rw}); + in_tensor->CopyFromCpu(inputs_.im_data_.data()); + } else if (tensor_name == "im_shape") { + in_tensor->Reshape({1, 2}); + in_tensor->CopyFromCpu(inputs_.im_shape_.data()); + } else if (tensor_name == "scale_factor") { + in_tensor->Reshape({1, 2}); + in_tensor->CopyFromCpu(inputs_.scale_factor_.data()); + } + } + // Run predictor + for (int i = 0; i < warmup; i++) + { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetOutputHandle(output_names[0]); + std::vector output_shape = out_tensor->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + + if (output_size < 6) { + std::cerr << "[WARNING] No object detected." 
<< std::endl; + } + output_data_.resize(output_size); + out_tensor->CopyToCpu(output_data_.data()); + } + + auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < repeats; i++) + { + predictor_->Run(); + // Get output tensor + auto output_names = predictor_->GetOutputNames(); + auto out_tensor = predictor_->GetOutputHandle(output_names[0]); + std::vector output_shape = out_tensor->shape(); + // Calculate output length + int output_size = 1; + for (int j = 0; j < output_shape.size(); ++j) { + output_size *= output_shape[j]; + } + + if (output_size < 6) { + std::cerr << "[WARNING] No object detected." << std::endl; + } + output_data_.resize(output_size); + out_tensor->CopyToCpu(output_data_.data()); + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + float ms = diff.count() / repeats * 1000; + printf("Inference: %f ms per batch image\n", ms); + // Postprocessing result + if(!run_benchmark) { + Postprocess(im, result); + } +} + +std::vector GenerateColorMap(int num_class) { + auto colormap = std::vector(3 * num_class, 0); + for (int i = 0; i < num_class; ++i) { + int j = 0; + int lab = i; + while (lab) { + colormap[i * 3] |= (((lab >> 0) & 1) << (7 - j)); + colormap[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)); + colormap[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)); + ++j; + lab >>= 3; + } + } + return colormap; +} + +} // namespace PaddleDetection diff --git a/deploy/cpp/src/preprocess_op.cc b/deploy/cpp/src/preprocess_op.cc new file mode 100644 index 0000000..6a2be41 --- /dev/null +++ b/deploy/cpp/src/preprocess_op.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "include/preprocess_op.h" + +namespace PaddleDetection { + +void InitInfo::Run(cv::Mat* im, ImageBlob* data) { + data->im_shape_ = { + static_cast(im->rows), + static_cast(im->cols) + }; + data->scale_factor_ = {1., 1.}; + data->in_net_shape_ = { + static_cast(im->rows), + static_cast(im->cols) + }; +} + +void NormalizeImage::Run(cv::Mat* im, ImageBlob* data) { + double e = 1.0; + if (is_scale_) { + e /= 255.0; + } + (*im).convertTo(*im, CV_32FC3, e); + for (int h = 0; h < im->rows; h++) { + for (int w = 0; w < im->cols; w++) { + im->at(h, w)[0] = + (im->at(h, w)[0] - mean_[0] ) / scale_[0]; + im->at(h, w)[1] = + (im->at(h, w)[1] - mean_[1] ) / scale_[1]; + im->at(h, w)[2] = + (im->at(h, w)[2] - mean_[2] ) / scale_[2]; + } + } +} + +void Permute::Run(cv::Mat* im, ImageBlob* data) { + int rh = im->rows; + int rw = im->cols; + int rc = im->channels(); + (data->im_data_).resize(rc * rh * rw); + float* base = (data->im_data_).data(); + for (int i = 0; i < rc; ++i) { + cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, base + i * rh * rw), i); + } +} + +void Resize::Run(cv::Mat* im, ImageBlob* data) { + auto resize_scale = GenerateScale(*im); + data->im_shape_ = { + static_cast(im->cols * resize_scale.first), + static_cast(im->rows * resize_scale.second) + }; + data->in_net_shape_ = { + static_cast(im->cols * resize_scale.first), + static_cast(im->rows * resize_scale.second) + }; + cv::resize( + *im, *im, cv::Size(), resize_scale.first, resize_scale.second, interp_); + data->im_shape_ = { + static_cast(im->rows), + static_cast(im->cols), + }; + data->scale_factor_ = { + resize_scale.second, + resize_scale.first, + }; +} + +std::pair Resize::GenerateScale(const cv::Mat& im) { + std::pair resize_scale; + int origin_w = im.cols; + int origin_h = im.rows; + + if (keep_ratio_) { + int im_size_max = std::max(origin_w, origin_h); + int im_size_min = std::min(origin_w, origin_h); + int target_size_max = *std::max_element(target_size_.begin(), target_size_.end()); + int target_size_min = *std::min_element(target_size_.begin(), target_size_.end()); + float scale_min = + static_cast(target_size_min) / static_cast(im_size_min); + float scale_max = + static_cast(target_size_max) / static_cast(im_size_max); + float scale_ratio = std::min(scale_min, scale_max); + resize_scale = {scale_ratio, scale_ratio}; + } else { + resize_scale.first = + static_cast(target_size_[1]) / static_cast(origin_w); + resize_scale.second = + static_cast(target_size_[0]) / static_cast(origin_h); + } + return resize_scale; +} + +void PadStride::Run(cv::Mat* im, ImageBlob* data) { + if (stride_ <= 0) { + return; + } + int rc = im->channels(); + int rh = im->rows; + int rw = im->cols; + int nh = (rh / stride_) * stride_ + (rh % stride_ != 0) * stride_; + int nw = (rw / stride_) * stride_ + (rw % stride_ != 0) * stride_; + cv::copyMakeBorder( + *im, + *im, + 0, + nh - rh, + 0, + nw - rw, + cv::BORDER_CONSTANT, + cv::Scalar(0)); + data->in_net_shape_ = { + static_cast(im->rows), + static_cast(im->cols), + }; + +} + + +// Preprocessor op running order +const std::vector Preprocessor::RUN_ORDER = { + "InitInfo", "Resize", "NormalizeImage", "PadStride", "Permute" +}; + +void Preprocessor::Run(cv::Mat* im, ImageBlob* data) { + for (const auto& name : RUN_ORDER) { + if (ops_.find(name) != ops_.end()) { + ops_[name]->Run(im, data); + } + } +} + +} // namespace PaddleDetection diff --git a/deploy/imgs/input_shape.png b/deploy/imgs/input_shape.png new file mode 100644 index 0000000..1148116 Binary files /dev/null 
and b/deploy/imgs/input_shape.png differ diff --git a/deploy/python/README.md b/deploy/python/README.md new file mode 100644 index 0000000..e0a5a32 --- /dev/null +++ b/deploy/python/README.md @@ -0,0 +1,80 @@
+# Python Inference Deployment
+
+Prediction in Python can be done with `tools/infer.py`, which depends on the PaddleDetection source code. Alternatively, follow this tutorial: export the model first, then run prediction with a single standalone script.
+
+
+This tutorial uses AnalysisPredictor to run high-performance inference on an [exported model](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/EXPORT_MODEL.md).
+
+In PaddlePaddle the inference engine and the training engine rely on different underlying optimizations. The inference engine, AnalysisPredictor, is optimized specifically for inference; it is the Python interface of the [C++ inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/native_infer.html), applies a number of graph optimizations to the model, and avoids unnecessary memory copies. For users who need high performance when deploying a trained model, we provide this prediction script, which is independent of PaddleDetection and easy to integrate.
+
+
+The workflow has two steps:
+
+- Export the inference model
+- Run prediction with Python
+
+## 1. Export the inference model
+
+During training, PaddleDetection keeps both the forward network and the optimizer parameters; for deployment only the forward parameters are needed. See [Export model](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/deploy/EXPORT_MODEL.md) for details.
+
+The exported directory contains four files: `infer_cfg.yml`, `model.pdiparams`, `model.pdiparams.info`, and `model.pdmodel`.
+
+## 2. Prediction with Python
+
+### 2.1 Install dependencies
+ - Install `PaddlePaddle`:
+   follow the [official installation guide](https://paddlepaddle.org.cn/install/quick) and choose a suitable method; version 2.0rc1 or later is required.
+ - Switch to the root directory of the `PaddleDetection` repository and run `pip install -r requirements.txt` to install the remaining dependencies.
+
+### 2.2 Run the prediction program
+Run the following command in a terminal:
+
+```bash
+python deploy/python/infer.py --model_dir=/path/to/models --image_file=/path/to/image \
+--use_gpu=(False/True)
+```
+
+Arguments:
+
+| Argument | Required | Description |
+|-------|-------|----------|
+| --model_dir | Yes | Path of the exported model directory described above |
+| --image_file | Optional | Image to predict |
+| --video_file | Optional | Video to predict |
+| --camera_id | Optional | ID of the camera to use; default -1 (no camera), valid values are 0 - (number of cameras - 1). During prediction press `q` in the visualization window to quit; the result is written to output/output.mp4 |
+| --use_gpu | No | Whether to use GPU; default False |
+| --run_mode | No | Only effective with GPU; default fluid, options (fluid/trt_fp32/trt_fp16/trt_int8) |
+| --threshold | No | Score threshold for predictions; default 0.5 |
+| --output_dir | No | Root directory for visualized results; default output/ |
+| --run_benchmark | No | Whether to run a benchmark; --image_file must also be set |
+
+Notes:
+
+- run_mode: fluid runs AnalysisPredictor with float32 precision; the other values run AnalysisPredictor with TensorRT at the corresponding precision.
+- The default PaddlePaddle GPU packages (<=1.7) do not support prediction with TensorRT. To accelerate prediction with TensorRT, build PaddlePaddle from source; see the [inference library build guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/paddle_tensorrt_infer.html).
+
+## 3. Deployment performance comparison
+Inference speed of AnalysisPredictor compared with Executor.
+
+### 3.1 Test environment
+
+- CUDA 9.0
+- CUDNN 7.5
+- PaddlePaddle 1.71
+- GPU: Tesla P40
+
+### 3.2 Test method
+
+- Batch Size = 1
+- The first 100 warmup iterations are excluded; the average of 100 iterations is reported in ms/image, counting only model execution time, not data preprocessing or copies.
+
+
+### 3.3 Results
+
+| Model | AnalysisPredictor | Executor | Input |
+|---|----|---|---|
+| YOLOv3-MobileNetv1 | 15.20 | 19.54 | 608*608 |
+| faster_rcnn_r50_fpn_1x | 50.05 | 69.58 | 800*1088 |
+| faster_rcnn_r50_1x | 326.11 | 347.22 | 800*1067 |
+| mask_rcnn_r50_fpn_1x | 67.49 | 91.02 | 800*1088 |
+| mask_rcnn_r50_1x | 326.11 | 350.94 | 800*1067 |
diff --git a/deploy/python/infer.py b/deploy/python/infer.py new file mode 100644 index 0000000..5bfd545 --- /dev/null +++ b/deploy/python/infer.py @@ -0,0 +1,551 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import time +import yaml +import ast +from functools import reduce + +from PIL import Image +import cv2 +import numpy as np +import paddle +from preprocess import preprocess, Resize, NormalizeImage, Permute, PadStride +from visualize import visualize_box_mask +from paddle.inference import Config +from paddle.inference import create_predictor + +# Global dictionary +SUPPORT_MODELS = { + 'YOLO', + 'RCNN', + 'SSD', + 'FCOS', + 'SOLOv2', + 'TTFNet', +} + + +class Detector(object): + """ + Args: + config (object): config of model, defined by `Config(model_dir)` + model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml + use_gpu (bool): whether use gpu + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) + threshold (float): threshold to reserve the result for output. + """ + + def __init__(self, + pred_config, + model_dir, + use_gpu=False, + run_mode='fluid', + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + threshold=0.5): + self.pred_config = pred_config + self.predictor = load_predictor( + model_dir, + run_mode=run_mode, + min_subgraph_size=self.pred_config.min_subgraph_size, + use_gpu=use_gpu, + use_dynamic_shape=use_dynamic_shape, + trt_min_shape=trt_min_shape, + trt_max_shape=trt_max_shape, + trt_opt_shape=trt_opt_shape) + + def preprocess(self, im): + preprocess_ops = [] + for op_info in self.pred_config.preprocess_infos: + new_op_info = op_info.copy() + op_type = new_op_info.pop('type') + preprocess_ops.append(eval(op_type)(**new_op_info)) + im, im_info = preprocess(im, preprocess_ops, + self.pred_config.input_shape) + inputs = create_inputs(im, im_info) + return inputs + + def postprocess(self, np_boxes, np_masks, inputs, threshold=0.5): + # postprocess output of predictor + results = {} + if self.pred_config.arch in ['Face']: + h, w = inputs['im_shape'] + scale_y, scale_x = inputs['scale_factor'] + w, h = float(h) / scale_y, float(w) / scale_x + np_boxes[:, 2] *= h + np_boxes[:, 3] *= w + np_boxes[:, 4] *= h + np_boxes[:, 5] *= w + results['boxes'] = np_boxes + if np_masks is not None: + results['masks'] = np_masks + return results + + def predict(self, + image, + threshold=0.5, + warmup=0, + repeats=1, + run_benchmark=False): + ''' + Args: + image (str/np.ndarray): path of image/ np.ndarray read by cv2 + threshold (float): threshold of predicted box' score + Returns: + results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + MaskRCNN's results include 'masks': np.ndarray: + shape: [N, im_h, im_w] + ''' + inputs = self.preprocess(image) + np_boxes, np_masks = None, None + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle(input_names[i]) + input_tensor.copy_from_cpu(inputs[input_names[i]]) + + for i in range(warmup): + self.predictor.run() + output_names = self.predictor.get_output_names() + boxes_tensor = self.predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if self.pred_config.mask: + masks_tensor = 
self.predictor.get_output_handle(output_names[2]) + np_masks = masks_tensor.copy_to_cpu() + + t1 = time.time() + for i in range(repeats): + self.predictor.run() + output_names = self.predictor.get_output_names() + boxes_tensor = self.predictor.get_output_handle(output_names[0]) + np_boxes = boxes_tensor.copy_to_cpu() + if self.pred_config.mask: + masks_tensor = self.predictor.get_output_handle(output_names[2]) + np_masks = masks_tensor.copy_to_cpu() + t2 = time.time() + ms = (t2 - t1) * 1000.0 / repeats + print("Inference: {} ms per batch image".format(ms)) + + # do not perform postprocess in benchmark mode + results = [] + if not run_benchmark: + if reduce(lambda x, y: x * y, np_boxes.shape) < 6: + print('[WARNNING] No object detected.') + results = {'boxes': np.array([])} + else: + results = self.postprocess( + np_boxes, np_masks, inputs, threshold=threshold) + + return results + + +class DetectorSOLOv2(Detector): + """ + Args: + config (object): config of model, defined by `Config(model_dir)` + model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml + use_gpu (bool): whether use gpu + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + threshold (float): threshold to reserve the result for output. + """ + + def __init__(self, + pred_config, + model_dir, + use_gpu=False, + run_mode='fluid', + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + threshold=0.5): + self.pred_config = pred_config + self.predictor = load_predictor( + model_dir, + run_mode=run_mode, + min_subgraph_size=self.pred_config.min_subgraph_size, + use_gpu=use_gpu, + use_dynamic_shape=use_dynamic_shape, + trt_min_shape=trt_min_shape, + trt_max_shape=trt_max_shape, + trt_opt_shape=trt_opt_shape) + + def predict(self, + image, + threshold=0.5, + warmup=0, + repeats=1, + run_benchmark=False): + ''' + Args: + image (str/np.ndarray): path of image/ np.ndarray read by cv2 + threshold (float): threshold of predicted box' score + Returns: + results (dict): 'segm': np.ndarray,shape:[N, im_h, im_w] + 'cate_label': label of segm, shape:[N] + 'cate_score': confidence score of segm, shape:[N] + ''' + inputs = self.preprocess(image) + np_label, np_score, np_segms = None, None, None + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle(input_names[i]) + input_tensor.copy_from_cpu(inputs[input_names[i]]) + + for i in range(warmup): + self.predictor.run() + output_names = self.predictor.get_output_names() + np_label = self.predictor.get_output_handle(output_names[ + 1]).copy_to_cpu() + np_score = self.predictor.get_output_handle(output_names[ + 2]).copy_to_cpu() + np_segms = self.predictor.get_output_handle(output_names[ + 3]).copy_to_cpu() + + t1 = time.time() + for i in range(repeats): + self.predictor.run() + output_names = self.predictor.get_output_names() + np_label = self.predictor.get_output_handle(output_names[ + 1]).copy_to_cpu() + np_score = self.predictor.get_output_handle(output_names[ + 2]).copy_to_cpu() + np_segms = self.predictor.get_output_handle(output_names[ + 3]).copy_to_cpu() + t2 = time.time() + ms = (t2 - t1) * 1000.0 / repeats + print("Inference: {} ms per batch image".format(ms)) + + # do not perform postprocess in benchmark mode + results = [] + 
if not run_benchmark: + return dict(segm=np_segms, label=np_label, score=np_score) + return results + + +def create_inputs(im, im_info): + """generate input for different model type + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + model_arch (str): model type + Returns: + inputs (dict): input of model + """ + inputs = {} + inputs['image'] = np.array((im, )).astype('float32') + inputs['im_shape'] = np.array((im_info['im_shape'], )).astype('float32') + inputs['scale_factor'] = np.array( + (im_info['scale_factor'], )).astype('float32') + + return inputs + + +class PredictConfig(): + """set config of preprocess, postprocess and visualize + Args: + model_dir (str): root path of model.yml + """ + + def __init__(self, model_dir): + # parsing Yaml config for Preprocess + deploy_file = os.path.join(model_dir, 'infer_cfg.yml') + with open(deploy_file) as f: + yml_conf = yaml.safe_load(f) + self.check_model(yml_conf) + self.arch = yml_conf['arch'] + self.preprocess_infos = yml_conf['Preprocess'] + self.min_subgraph_size = yml_conf['min_subgraph_size'] + self.labels = yml_conf['label_list'] + self.mask = False + if 'mask' in yml_conf: + self.mask = yml_conf['mask'] + self.input_shape = yml_conf['image_shape'] + self.print_config() + + def check_model(self, yml_conf): + """ + Raises: + ValueError: loaded model not in supported model type + """ + for support_model in SUPPORT_MODELS: + if support_model in yml_conf['arch']: + return True + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ + 'arch'], SUPPORT_MODELS)) + + def print_config(self): + print('----------- Model Configuration -----------') + print('%s: %s' % ('Model Arch', self.arch)) + print('%s: ' % ('Transform Order')) + for op_info in self.preprocess_infos: + print('--%s: %s' % ('transform op', op_info['type'])) + print('--------------------------------------------') + + +def load_predictor(model_dir, + run_mode='fluid', + batch_size=1, + use_gpu=False, + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640): + """set AnalysisConfig, generate AnalysisPredictor + Args: + model_dir (str): root path of __model__ and __params__ + use_gpu (bool): whether use gpu + run_mode (str): mode of running(fluid/trt_fp32/trt_fp16/trt_int8) + use_dynamic_shape (bool): use dynamic shape or not + trt_min_shape (int): min shape for dynamic shape in trt + trt_max_shape (int): max shape for dynamic shape in trt + trt_opt_shape (int): opt shape for dynamic shape in trt + Returns: + predictor (PaddlePredictor): AnalysisPredictor + Raises: + ValueError: predict by TensorRT need use_gpu == True. 
+ """ + if not use_gpu and not run_mode == 'fluid': + raise ValueError( + "Predict by TensorRT mode: {}, expect use_gpu==True, but use_gpu == {}" + .format(run_mode, use_gpu)) + use_calib_mode = True if run_mode == 'trt_int8' else False + config = Config( + os.path.join(model_dir, 'model.pdmodel'), + os.path.join(model_dir, 'model.pdiparams')) + precision_map = { + 'trt_int8': Config.Precision.Int8, + 'trt_fp32': Config.Precision.Float32, + 'trt_fp16': Config.Precision.Half + } + if use_gpu: + # initial GPU memory(M), device ID + config.enable_use_gpu(200, 0) + # optimize graph and fuse op + config.switch_ir_optim(True) + else: + config.disable_gpu() + + if run_mode in precision_map.keys(): + config.enable_tensorrt_engine( + workspace_size=1 << 10, + max_batch_size=batch_size, + min_subgraph_size=min_subgraph_size, + precision_mode=precision_map[run_mode], + use_static=False, + use_calib_mode=use_calib_mode) + + if use_dynamic_shape: + print('use_dynamic_shape') + min_input_shape = {'image': [1, 3, trt_min_shape, trt_min_shape]} + max_input_shape = {'image': [1, 3, trt_max_shape, trt_max_shape]} + opt_input_shape = {'image': [1, 3, trt_opt_shape, trt_opt_shape]} + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, + opt_input_shape) + print('trt set dynamic shape done!') + + # disable print log when predict + config.disable_glog_info() + # enable shared memory + config.enable_memory_optim() + # disable feed, fetch OP, needed by zero_copy_run + config.switch_use_feed_fetch_ops(False) + predictor = create_predictor(config) + return predictor + + +def visualize(image_file, results, labels, output_dir='output/', threshold=0.5): + # visualize the predict result + im = visualize_box_mask(image_file, results, labels, threshold=threshold) + img_name = os.path.split(image_file)[-1] + if not os.path.exists(output_dir): + os.makedirs(output_dir) + out_path = os.path.join(output_dir, img_name) + im.save(out_path, quality=95) + print("save result to: " + out_path) + + +def print_arguments(args): + print('----------- Running Arguments -----------') + for arg, value in sorted(vars(args).items()): + print('%s: %s' % (arg, value)) + print('------------------------------------------') + + +def predict_image(detector): + if FLAGS.run_benchmark: + detector.predict( + FLAGS.image_file, + FLAGS.threshold, + warmup=100, + repeats=100, + run_benchmark=True) + else: + results = detector.predict(FLAGS.image_file, FLAGS.threshold) + visualize( + FLAGS.image_file, + results, + detector.pred_config.labels, + output_dir=FLAGS.output_dir, + threshold=FLAGS.threshold) + + +def predict_video(detector, camera_id): + if camera_id != -1: + capture = cv2.VideoCapture(camera_id) + video_name = 'output.mp4' + else: + capture = cv2.VideoCapture(FLAGS.video_file) + video_name = os.path.split(FLAGS.video_file)[-1] + fps = 30 + width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) + # yapf: disable + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + # yapf: enable + if not os.path.exists(FLAGS.output_dir): + os.makedirs(FLAGS.output_dir) + out_path = os.path.join(FLAGS.output_dir, video_name) + writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) + index = 1 + while (1): + ret, frame = capture.read() + if not ret: + break + print('detect frame:%d' % (index)) + index += 1 + results = detector.predict(frame, FLAGS.threshold) + im = visualize_box_mask( + frame, + results, + detector.pred_config.labels, + threshold=FLAGS.threshold) + im = np.array(im) + 
writer.write(im) + if camera_id != -1: + cv2.imshow('Mask Detection', im) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + writer.release() + + +def main(): + pred_config = PredictConfig(FLAGS.model_dir) + detector = Detector( + pred_config, + FLAGS.model_dir, + use_gpu=FLAGS.use_gpu, + run_mode=FLAGS.run_mode, + use_dynamic_shape=FLAGS.use_dynamic_shape, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape) + if pred_config.arch == 'SOLOv2': + detector = DetectorSOLOv2( + pred_config, + FLAGS.model_dir, + use_gpu=FLAGS.use_gpu, + run_mode=FLAGS.run_mode, + use_dynamic_shape=FLAGS.use_dynamic_shape, + trt_min_shape=FLAGS.trt_min_shape, + trt_max_shape=FLAGS.trt_max_shape, + trt_opt_shape=FLAGS.trt_opt_shape) + # predict from image + if FLAGS.image_file != '': + predict_image(detector) + # predict from video file or camera video stream + if FLAGS.video_file != '' or FLAGS.camera_id != -1: + predict_video(detector, FLAGS.camera_id) + + +if __name__ == '__main__': + paddle.enable_static() + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--model_dir", + type=str, + default=None, + help=("Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py."), + required=True) + parser.add_argument( + "--image_file", type=str, default='', help="Path of image file.") + parser.add_argument( + "--video_file", type=str, default='', help="Path of video file.") + parser.add_argument( + "--camera_id", + type=int, + default=-1, + help="device id of camera to predict.") + parser.add_argument( + "--run_mode", + type=str, + default='fluid', + help="mode of running(fluid/trt_fp32/trt_fp16/trt_int8)") + parser.add_argument( + "--use_gpu", + type=ast.literal_eval, + default=False, + help="Whether to predict with GPU.") + parser.add_argument( + "--run_benchmark", + type=ast.literal_eval, + default=False, + help="Whether to predict a image_file repeatedly for benchmark") + parser.add_argument( + "--threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory of output visualization files.") + parser.add_argument( + "--use_dynamic_shape", + type=ast.literal_eval, + default=False, + help="Dynamic_shape for TensorRT.") + parser.add_argument( + "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument( + "--trt_max_shape", + type=int, + default=1280, + help="max_shape for TensorRT.") + parser.add_argument( + "--trt_opt_shape", + type=int, + default=640, + help="opt_shape for TensorRT.") + + FLAGS = parser.parse_args() + print_arguments(FLAGS) + if FLAGS.image_file != '' and FLAGS.video_file != '': + assert "Cannot predict image and video at the same time" + + main() diff --git a/deploy/python/preprocess.py b/deploy/python/preprocess.py new file mode 100644 index 0000000..371b117 --- /dev/null +++ b/deploy/python/preprocess.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from PIL import Image +import cv2 +import numpy as np + + +def decode_image(im_file, im_info): + """read rgb image + Args: + im_file (str|np.ndarray): input can be image path or np.ndarray + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + if isinstance(im_file, str): + with open(im_file, 'rb') as f: + im_read = f.read() + data = np.frombuffer(im_read, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + else: + im = im_file + im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return im, im_info + + +class Resize(object): + """resize image by target_size and max_size + Args: + target_size (int): the target size of image + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): method of resize + """ + + def __init__( + self, + target_size, + keep_ratio=True, + interp=cv2.INTER_LINEAR, ): + if isinstance(target_size, int): + target_size = [target_size, target_size] + self.target_size = target_size + self.keep_ratio = keep_ratio + self.interp = interp + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + assert len(self.target_size) == 2 + assert self.target_size[0] > 0 and self.target_size[1] > 0 + im_channel = im.shape[2] + im_scale_y, im_scale_x = self.generate_scale(im) + # set image_shape + im_info['input_shape'][1] = int(im_scale_y * im.shape[0]) + im_info['input_shape'][2] = int(im_scale_x * im.shape[1]) + im = cv2.resize( + im, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + im_info['im_shape'] = np.array(im.shape[:2]).astype('float32') + im_info['scale_factor'] = np.array( + [im_scale_y, im_scale_x]).astype('float32') + return im, im_info + + def generate_scale(self, im): + """ + Args: + im (np.ndarray): image (np.ndarray) + Returns: + im_scale_x: the resize ratio of X + im_scale_y: the resize ratio of Y + """ + origin_shape = im.shape[:2] + im_c = im.shape[2] + if self.keep_ratio: + im_size_min = np.min(origin_shape) + im_size_max = np.max(origin_shape) + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + im_scale = float(target_size_min) / float(im_size_min) + if np.round(im_scale * im_size_max) > target_size_max: + im_scale = float(target_size_max) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / float(origin_shape[0]) + im_scale_x = resize_w / float(origin_shape[1]) + return im_scale_y, im_scale_x + + +class NormalizeImage(object): + """normalize image + Args: + mean (list): im - mean + std (list): im / std + is_scale (bool): whether need im / 255 + is_channel_first (bool): if True: image shape is CHW, else: HWC + """ + + def __init__(self, mean, std, is_scale=True): + self.mean = mean + self.std = std + self.is_scale = is_scale + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.astype(np.float32, 
copy=False) + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + + if self.is_scale: + im = im / 255.0 + + im -= mean + im /= std + return im, im_info + + +class Permute(object): + """permute image + Args: + to_bgr (bool): whether convert RGB to BGR + channel_first (bool): whether convert HWC to CHW + """ + + def __init__(self, ): + super(Permute, self).__init__() + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + im = im.transpose((2, 0, 1)).copy() + return im, im_info + + +class PadStride(object): + """ padding image for model with FPN , instead PadBatch(pad_to_stride, pad_gt) in original config + Args: + stride (bool): model with FPN need image shape % stride == 0 + """ + + def __init__(self, stride=0): + self.coarsest_stride = stride + + def __call__(self, im, im_info): + """ + Args: + im (np.ndarray): image (np.ndarray) + im_info (dict): info of image + Returns: + im (np.ndarray): processed image (np.ndarray) + im_info (dict): info of processed image + """ + coarsest_stride = self.coarsest_stride + if coarsest_stride <= 0: + return im, im_info + im_c, im_h, im_w = im.shape + pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) + pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) + padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + return padding_im, im_info + + +def preprocess(im, preprocess_ops, input_shape): + # process image by preprocess_ops + im_info = { + 'scale_factor': np.array( + [1., 1.], dtype=np.float32), + 'im_shape': None, + 'input_shape': input_shape, + } + im, im_info = decode_image(im, im_info) + for operator in preprocess_ops: + im, im_info = operator(im, im_info) + return im, im_info diff --git a/deploy/python/visualize.py b/deploy/python/visualize.py new file mode 100644 index 0000000..6093572 --- /dev/null +++ b/deploy/python/visualize.py @@ -0,0 +1,207 @@ +# coding: utf-8 +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import cv2 +import numpy as np +from PIL import Image, ImageDraw +from scipy import ndimage + + +def visualize_box_mask(im, results, labels, threshold=0.5): + """ + Args: + im (str/np.ndarray): path of image/np.ndarray read by cv2 + results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + MaskRCNN's results include 'masks': np.ndarray: + shape:[N, im_h, im_w] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): Threshold of score. 
+ Returns: + im (PIL.Image.Image): visualized image + """ + if isinstance(im, str): + im = Image.open(im).convert('RGB') + else: + im = Image.fromarray(im) + if 'masks' in results and 'boxes' in results: + im = draw_mask( + im, results['boxes'], results['masks'], labels, threshold=threshold) + if 'boxes' in results: + im = draw_box(im, results['boxes'], labels, threshold=threshold) + if 'segm' in results: + im = draw_segm( + im, + results['segm'], + results['label'], + results['score'], + labels, + threshold=threshold) + return im + + +def get_color_map_list(num_classes): + """ + Args: + num_classes (int): number of class + Returns: + color_map (list): RGB color list + """ + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + return color_map + + +def draw_mask(im, np_boxes, np_masks, labels, threshold=0.5): + """ + Args: + im (PIL.Image.Image): PIL image + np_boxes (np.ndarray): shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + np_masks (np.ndarray): shape:[N, im_h, im_w] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): threshold of mask + Returns: + im (PIL.Image.Image): visualized image + """ + color_list = get_color_map_list(len(labels)) + w_ratio = 0.4 + alpha = 0.7 + im = np.array(im).astype('float32') + clsid2color = {} + expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) + np_boxes = np_boxes[expect_boxes, :] + np_masks = np_masks[expect_boxes, :, :] + for i in range(len(np_masks)): + clsid, score = int(np_boxes[i][0]), np_boxes[i][1] + mask = np_masks[i] + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color_mask = clsid2color[clsid] + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + color_mask = np.array(color_mask) + im[idx[0], idx[1], :] *= 1.0 - alpha + im[idx[0], idx[1], :] += alpha * color_mask + return Image.fromarray(im.astype('uint8')) + + +def draw_box(im, np_boxes, labels, threshold=0.5): + """ + Args: + im (PIL.Image.Image): PIL image + np_boxes (np.ndarray): shape:[N,6], N: number of box, + matix element:[class, score, x_min, y_min, x_max, y_max] + labels (list): labels:['class1', ..., 'classn'] + threshold (float): threshold of box + Returns: + im (PIL.Image.Image): visualized image + """ + draw_thickness = min(im.size) // 320 + draw = ImageDraw.Draw(im) + clsid2color = {} + color_list = get_color_map_list(len(labels)) + expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) + np_boxes = np_boxes[expect_boxes, :] + + for dt in np_boxes: + clsid, bbox, score = int(dt[0]), dt[2:], dt[1] + xmin, ymin, xmax, ymax = bbox + print('class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],' + 'right_bottom:[{:.2f},{:.2f}]'.format( + int(clsid), score, xmin, ymin, xmax, ymax)) + w = xmax - xmin + h = ymax - ymin + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color = tuple(clsid2color[clsid]) + + # draw bbox + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=draw_thickness, + fill=color) + + # draw label + text = "{} {:.4f}".format(labels[clsid], score) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - 
th), (xmin + tw + 1, ymin)], fill=color) + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + return im + + +def draw_segm(im, + np_segms, + np_label, + np_score, + labels, + threshold=0.5, + alpha=0.7): + """ + Draw segmentation on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = get_color_map_list(len(labels)) + im = np.array(im).astype('float32') + clsid2color = {} + np_segms = np_segms.astype(np.uint8) + for i in range(np_segms.shape[0]): + mask, score, clsid = np_segms[i], np_score[i], np_label[i] + if score < threshold: + continue + + if clsid not in clsid2color: + clsid2color[clsid] = color_list[clsid] + color_mask = clsid2color[clsid] + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + color_mask = np.array(color_mask) + im[idx[0], idx[1], :] *= 1.0 - alpha + im[idx[0], idx[1], :] += alpha * color_mask + sum_x = np.sum(mask, axis=0) + x = np.where(sum_x > 0.5)[0] + sum_y = np.sum(mask, axis=1) + y = np.where(sum_y > 0.5)[0] + x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] + cv2.rectangle(im, (x0, y0), (x1, y1), + tuple(color_mask.astype('int32').tolist()), 1) + bbox_text = '%s %.2f' % (labels[clsid], score) + t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] + cv2.rectangle(im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), + tuple(color_mask.astype('int32').tolist()), -1) + cv2.putText( + im, + bbox_text, (x0, y0 - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.3, (0, 0, 0), + 1, + lineType=cv2.LINE_AA) + return Image.fromarray(im.astype('uint8')) diff --git a/deploy/serving/README.md b/deploy/serving/README.md new file mode 100644 index 0000000..0d0122b --- /dev/null +++ b/deploy/serving/README.md @@ -0,0 +1,115 @@ +# 服务端预测部署 + +`PaddleDetection`训练出来的模型可以使用[Serving](https://github.com/PaddlePaddle/Serving) 部署在服务端。 +本教程以在COCO数据集上用`configs/yolov3/yolov3_darknet53_270e_coco.yml`算法训练的模型进行部署。 +预训练模型权重文件为[yolov3_darknet53_270e_coco.pdparams](https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams) 。 + +## 1. 首先验证模型 +``` +python tools/infer.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml --infer_img=demo/000000014439.jpg -o use_gpu=True weights=https://paddledet.bj.bcebos.com/models/yolov3_darknet53_270e_coco.pdparams --infer_img=demo/000000014439.jpg +``` + +## 2. 安装 paddle serving +请参考[PaddleServing](https://github.com/PaddlePaddle/Serving/tree/v0.5.0) 中安装教程安装 + +## 3. 导出模型 +PaddleDetection在训练过程包括网络的前向和优化器相关参数,而在部署过程中,我们只需要前向参数,具体参考:[导出模型](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.0/docs/advanced_tutorials/deploy/EXPORT_MODEL.md) + +``` +python tools/export_model.py -c configs/yolov3/yolov3_darknet53_270e_coco.yml -o weights=weights/yolov3_darknet53_270e_coco.pdparams --export_serving_model=True +``` + +以上命令会在`output_inference/`文件夹下生成一个`yolov3_darknet53_270e_coco`文件夹: +``` +output_inference +│ ├── yolov3_darknet53_270e_coco +│ │ ├── infer_cfg.yml +│ │ ├── model.pdiparams +│ │ ├── model.pdiparams.info +│ │ ├── model.pdmodel +│ │ ├── serving_client +│ │ │ ├── serving_client_conf.prototxt +│ │ │ ├── serving_client_conf.stream.prototxt +│ │ ├── serving_server +│ │ │ ├── __model__ +│ │ │ ├── __params__ +│ │ │ ├── serving_server_conf.prototxt +│ │ │ ├── serving_server_conf.stream.prototxt +│ │ │ ├── ... 
+``` + +`serving_client`文件夹下`serving_client_conf.prototxt`详细说明了模型输入输出信息 +`serving_client_conf.prototxt`文件内容为: +``` +lient_conf.prototxt +feed_var { + name: "im_shape" + alias_name: "im_shape" + is_lod_tensor: false + feed_type: 1 + shape: 2 +} +feed_var { + name: "image" + alias_name: "image" + is_lod_tensor: false + feed_type: 1 + shape: 3 + shape: 608 + shape: 608 +} +feed_var { + name: "scale_factor" + alias_name: "scale_factor" + is_lod_tensor: false + feed_type: 1 + shape: 2 +} +fetch_var { + name: "save_infer_model/scale_0.tmp_1" + alias_name: "save_infer_model/scale_0.tmp_1" + is_lod_tensor: true + fetch_type: 1 + shape: -1 +} +fetch_var { + name: "save_infer_model/scale_1.tmp_1" + alias_name: "save_infer_model/scale_1.tmp_1" + is_lod_tensor: true + fetch_type: 2 + shape: -1 +} +``` + +## 4. 启动PaddleServing服务 + +``` +cd output_inference/yolov3_darknet53_270e_coco/ + +# GPU +python -m paddle_serving_server_gpu.serve --model serving_server --port 9393 --gpu_ids 0 + +# CPU +python -m paddle_serving_server.serve --model serving_server --port 9393 +``` + +## 5. 测试部署的服务 +准备`label_list.txt`文件 +``` +# 进入到导出模型文件夹 +cd output_inference/yolov3_darknet53_270e_coco/ + +# 将数据集对应的label_list.txt文件放到当前文件夹下 +``` + +设置`prototxt`文件路径为`serving_client/serving_client_conf.prototxt` 。 +设置`fetch`为`fetch=["save_infer_model/scale_0.tmp_1"])` + +测试 +``` +# 进入目录 +cd output_inference/yolov3_darknet53_270e_coco/ + +# 测试代码 test_client.py 会自动创建output文件夹,并在output下生成`bbox.json`和`000000014439.jpg`两个文件 +python ../../deploy/serving/test_client.py ../../demo/000000014439.jpg +``` diff --git a/deploy/serving/test_client.py b/deploy/serving/test_client.py new file mode 100644 index 0000000..7c2a639 --- /dev/null +++ b/deploy/serving/test_client.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import numpy as np +from paddle_serving_client import Client +from paddle_serving_app.reader import * +import cv2 +preprocess = Sequential([ + File2Image(), BGR2RGB(), Resize( + (608, 608), interpolation=cv2.INTER_LINEAR), Div(255.0), Transpose( + (2, 0, 1)) +]) + +postprocess = RCNNPostprocess("label_list.txt", "output", [608, 608]) +client = Client() + +client.load_client_config("serving_client/serving_client_conf.prototxt") +client.connect(['127.0.0.1:9393']) + +im = preprocess(sys.argv[1]) +fetch_map = client.predict( + feed={ + "image": im, + "im_size": np.array(list(im.shape[1:])), + }, + fetch=["multiclass_nms_0.tmp_0"]) +fetch_map["image"] = sys.argv[1] +postprocess(fetch_map) diff --git a/ppdet/__init__.py b/ppdet/__init__.py new file mode 100644 index 0000000..56b687d --- /dev/null +++ b/ppdet/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import (core, data, engine, modeling, model_zoo, optimizer, metrics, + utils, slim) diff --git a/ppdet/__pycache__/__init__.cpython-38.pyc b/ppdet/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..6e950d5 Binary files /dev/null and b/ppdet/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/__pycache__/__init__.cpython-39.pyc b/ppdet/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..898870c Binary files /dev/null and b/ppdet/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/__pycache__/optimizer.cpython-38.pyc b/ppdet/__pycache__/optimizer.cpython-38.pyc new file mode 100644 index 0000000..cc3b1c7 Binary files /dev/null and b/ppdet/__pycache__/optimizer.cpython-38.pyc differ diff --git a/ppdet/__pycache__/optimizer.cpython-39.pyc b/ppdet/__pycache__/optimizer.cpython-39.pyc new file mode 100644 index 0000000..b4313ed Binary files /dev/null and b/ppdet/__pycache__/optimizer.cpython-39.pyc differ diff --git a/ppdet/core/__init__.py b/ppdet/core/__init__.py new file mode 100644 index 0000000..d042771 --- /dev/null +++ b/ppdet/core/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import config diff --git a/ppdet/core/__pycache__/__init__.cpython-38.pyc b/ppdet/core/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..82b5e96 Binary files /dev/null and b/ppdet/core/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/core/__pycache__/__init__.cpython-39.pyc b/ppdet/core/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..20e2e7c Binary files /dev/null and b/ppdet/core/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/core/__pycache__/workspace.cpython-38.pyc b/ppdet/core/__pycache__/workspace.cpython-38.pyc new file mode 100644 index 0000000..0e805d2 Binary files /dev/null and b/ppdet/core/__pycache__/workspace.cpython-38.pyc differ diff --git a/ppdet/core/__pycache__/workspace.cpython-39.pyc b/ppdet/core/__pycache__/workspace.cpython-39.pyc new file mode 100644 index 0000000..72b24db Binary files /dev/null and b/ppdet/core/__pycache__/workspace.cpython-39.pyc differ diff --git a/ppdet/core/config/__init__.py b/ppdet/core/config/__init__.py new file mode 100644 index 0000000..d0c32e2 --- /dev/null +++ b/ppdet/core/config/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/core/config/__pycache__/__init__.cpython-38.pyc b/ppdet/core/config/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..56fb491 Binary files /dev/null and b/ppdet/core/config/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/core/config/__pycache__/__init__.cpython-39.pyc b/ppdet/core/config/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c794c22 Binary files /dev/null and b/ppdet/core/config/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/core/config/__pycache__/schema.cpython-38.pyc b/ppdet/core/config/__pycache__/schema.cpython-38.pyc new file mode 100644 index 0000000..4082b69 Binary files /dev/null and b/ppdet/core/config/__pycache__/schema.cpython-38.pyc differ diff --git a/ppdet/core/config/__pycache__/schema.cpython-39.pyc b/ppdet/core/config/__pycache__/schema.cpython-39.pyc new file mode 100644 index 0000000..25ecb73 Binary files /dev/null and b/ppdet/core/config/__pycache__/schema.cpython-39.pyc differ diff --git a/ppdet/core/config/__pycache__/yaml_helpers.cpython-38.pyc b/ppdet/core/config/__pycache__/yaml_helpers.cpython-38.pyc new file mode 100644 index 0000000..119382a Binary files /dev/null and b/ppdet/core/config/__pycache__/yaml_helpers.cpython-38.pyc differ diff --git a/ppdet/core/config/__pycache__/yaml_helpers.cpython-39.pyc b/ppdet/core/config/__pycache__/yaml_helpers.cpython-39.pyc new file mode 100644 index 0000000..25459a0 Binary files /dev/null and b/ppdet/core/config/__pycache__/yaml_helpers.cpython-39.pyc differ diff --git a/ppdet/core/config/schema.py b/ppdet/core/config/schema.py new file mode 100644 index 0000000..0d2b0da --- /dev/null +++ b/ppdet/core/config/schema.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
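# A minimal usage sketch of the schema machinery defined in this file:
# extract_schema (below) reads a class's __init__ signature together with its
# __shared__ annotation into a SchemaDict. The class `ConfigDemo`, its
# parameters, and the printed values are hypothetical, for illustration only.

from ppdet.core.config.schema import SharedConfig, extract_schema


class ConfigDemo(object):
    __shared__ = ['num_classes']

    def __init__(self, num_classes=80, scales=[8, 16, 32]):
        self.num_classes = num_classes
        self.scales = scales


schema = extract_schema(ConfigDemo)
print(schema.name)                      # ConfigDemo
print(schema.shared)                    # ['num_classes']
print(schema.schema['scales'].default)  # [8, 16, 32]
# Shared params default to a SharedConfig, so a globally configured
# num_classes can override the per-module default when the module is created.
print(isinstance(schema.schema['num_classes'].default, SharedConfig))  # True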
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import inspect
+import importlib
+import re
+
+try:
+    from docstring_parser import parse as doc_parse
+except Exception:
+
+    def doc_parse(*args):
+        pass
+
+
+try:
+    from typeguard import check_type
+except Exception:
+
+    def check_type(*args):
+        pass
+
+
+__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']
+
+
+class SchemaValue(object):
+    def __init__(self, name, doc='', type=None):
+        super(SchemaValue, self).__init__()
+        self.name = name
+        self.doc = doc
+        self.type = type
+
+    def set_default(self, value):
+        self.default = value
+
+    def has_default(self):
+        return hasattr(self, 'default')
+
+
+class SchemaDict(dict):
+    def __init__(self, **kwargs):
+        super(SchemaDict, self).__init__()
+        self.schema = {}
+        self.strict = False
+        self.doc = ""
+        self.update(kwargs)
+
+    def __setitem__(self, key, value):
+        # XXX also update regular dict to SchemaDict??
+        if isinstance(value, dict) and key in self and isinstance(self[key],
+                                                                  SchemaDict):
+            self[key].update(value)
+        else:
+            super(SchemaDict, self).__setitem__(key, value)
+
+    def __missing__(self, key):
+        if self.has_default(key):
+            return self.schema[key].default
+        elif key in self.schema:
+            return self.schema[key]
+        else:
+            raise KeyError(key)
+
+    def copy(self):
+        newone = SchemaDict()
+        newone.__dict__.update(self.__dict__)
+        newone.update(self)
+        return newone
+
+    def set_schema(self, key, value):
+        assert isinstance(value, SchemaValue)
+        self.schema[key] = value
+
+    def set_strict(self, strict):
+        self.strict = strict
+
+    def has_default(self, key):
+        return key in self.schema and self.schema[key].has_default()
+
+    def is_default(self, key):
+        if not self.has_default(key):
+            return False
+        if hasattr(self[key], '__dict__'):
+            return True
+        else:
+            return key not in self or self[key] == self.schema[key].default
+
+    def find_default_keys(self):
+        return [
+            k for k in list(self.keys()) + list(self.schema.keys())
+            if self.is_default(k)
+        ]
+
+    def mandatory(self):
+        return any([k for k in self.schema.keys() if not self.has_default(k)])
+
+    def find_missing_keys(self):
+        missing = [
+            k for k in self.schema.keys()
+            if k not in self and not self.has_default(k)
+        ]
+        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
+        return missing + placeholders
+
+    def find_extra_keys(self):
+        return list(set(self.keys()) - set(self.schema.keys()))
+
+    def find_mismatch_keys(self):
+        mismatch_keys = []
+        for arg in self.schema.values():
+            if arg.type is not None:
+                try:
+                    check_type("{}.{}".format(self.name, arg.name),
+                               self[arg.name], arg.type)
+                except Exception:
+                    mismatch_keys.append(arg.name)
+        return mismatch_keys
+
+    def validate(self):
+        missing_keys = self.find_missing_keys()
+        if missing_keys:
+            raise ValueError("Missing param for class<{}>: {}".format(
+                self.name, ", ".join(missing_keys)))
+        extra_keys = self.find_extra_keys()
+        if extra_keys and self.strict:
+            raise ValueError("Extraneous param for class<{}>: {}".format(
+                self.name, ", ".join(extra_keys)))
+        mismatch_keys = self.find_mismatch_keys()
+        if mismatch_keys:
+            raise TypeError("Wrong param type for class<{}>: {}".format(
+                self.name, ", ".join(mismatch_keys)))
+
+
+class SharedConfig(object):
+    """
+    Representation class for `__shared__` annotations, which work as follows:
+
+    - if `key` is set for the module in config file, its value will take
+      precedence
+    - if `key` is not set for the module but present in the config file, its
+      value
will be used + - otherwise, use the provided `default_value` as fallback + + Args: + key: config[key] will be injected + default_value: fallback value + """ + + def __init__(self, key, default_value=None): + super(SharedConfig, self).__init__() + self.key = key + self.default_value = default_value + + +def extract_schema(cls): + """ + Extract schema from a given class + + Args: + cls (type): Class from which to extract. + + Returns: + schema (SchemaDict): Extracted schema. + """ + ctor = cls.__init__ + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(ctor) + annotations = argspec.annotations + has_kwargs = argspec.varkw is not None + else: + argspec = inspect.getargspec(ctor) + # python 2 type hinting workaround, see pep-3107 + # however, since `typeguard` does not support python 2, type checking + # is still python 3 only for now + annotations = getattr(ctor, '__annotations__', {}) + has_kwargs = argspec.keywords is not None + + names = [arg for arg in argspec.args if arg != 'self'] + defaults = argspec.defaults + num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 + num_required = len(names) - num_defaults + + docs = cls.__doc__ + if docs is None and getattr(cls, '__category__', None) == 'op': + docs = cls.__call__.__doc__ + try: + docstring = doc_parse(docs) + except Exception: + docstring = None + + if docstring is None: + comments = {} + else: + comments = {} + for p in docstring.params: + match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) + if match_obj is not None: + comments[match_obj.group(1)] = p.description + + schema = SchemaDict() + schema.name = cls.__name__ + schema.doc = "" + if docs is not None: + start_pos = docs[0] == '\n' and 1 or 0 + schema.doc = docs[start_pos:].split("\n")[0].strip() + # XXX handle paddle's weird doc convention + if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: + schema.doc = schema.doc[2:-2].strip() + schema.category = hasattr(cls, '__category__') and getattr( + cls, '__category__') or 'module' + schema.strict = not has_kwargs + schema.pymodule = importlib.import_module(cls.__module__) + schema.inject = getattr(cls, '__inject__', []) + schema.shared = getattr(cls, '__shared__', []) + for idx, name in enumerate(names): + comment = name in comments and comments[name] or name + if name in schema.inject: + type_ = None + else: + type_ = name in annotations and annotations[name] or None + value_schema = SchemaValue(name, comment, type_) + if name in schema.shared: + assert idx >= num_required, "shared config must have default value" + default = defaults[idx - num_required] + value_schema.set_default(SharedConfig(name, default)) + elif idx >= num_required: + default = defaults[idx - num_required] + value_schema.set_default(default) + schema.set_schema(name, value_schema) + + return schema diff --git a/ppdet/core/config/yaml_helpers.py b/ppdet/core/config/yaml_helpers.py new file mode 100644 index 0000000..1545b6b --- /dev/null +++ b/ppdet/core/config/yaml_helpers.py @@ -0,0 +1,118 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import inspect + +import yaml +from .schema import SharedConfig + +__all__ = ['serializable', 'Callable'] + + +def represent_dictionary_order(self, dict_data): + return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) + + +def setup_orderdict(): + from collections import OrderedDict + yaml.add_representer(OrderedDict, represent_dictionary_order) + + +def _make_python_constructor(cls): + def python_constructor(loader, node): + if isinstance(node, yaml.SequenceNode): + args = loader.construct_sequence(node, deep=True) + return cls(*args) + else: + kwargs = loader.construct_mapping(node, deep=True) + try: + return cls(**kwargs) + except Exception as ex: + print("Error when construct {} instance from yaml config". + format(cls.__name__)) + raise ex + + return python_constructor + + +def _make_python_representer(cls): + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(cls) + else: + argspec = inspect.getargspec(cls.__init__) + argnames = [arg for arg in argspec.args if arg != 'self'] + + def python_representer(dumper, obj): + if argnames: + data = {name: getattr(obj, name) for name in argnames} + else: + data = obj.__dict__ + if '_id' in data: + del data['_id'] + return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) + + return python_representer + + +def serializable(cls): + """ + Add loader and dumper for given class, which must be + "trivially serializable" + + Args: + cls: class to be serialized + + Returns: cls + """ + yaml.add_constructor(u'!{}'.format(cls.__name__), + _make_python_constructor(cls)) + yaml.add_representer(cls, _make_python_representer(cls)) + return cls + + +yaml.add_representer(SharedConfig, + lambda d, o: d.represent_data(o.default_value)) + + +@serializable +class Callable(object): + """ + Helper to be used in Yaml for creating arbitrary class objects + + Args: + full_type (str): the full module path to target function + """ + + def __init__(self, full_type, args=[], kwargs={}): + super(Callable, self).__init__() + self.full_type = full_type + self.args = args + self.kwargs = kwargs + + def __call__(self): + if '.' in self.full_type: + idx = self.full_type.rfind('.') + module = importlib.import_module(self.full_type[:idx]) + func_name = self.full_type[idx + 1:] + else: + try: + module = importlib.import_module('builtins') + except Exception: + module = importlib.import_module('__builtin__') + func_name = self.full_type + + func = getattr(module, func_name) + return func(*self.args, **self.kwargs) diff --git a/ppdet/core/workspace.py b/ppdet/core/workspace.py new file mode 100644 index 0000000..5d6a5d9 --- /dev/null +++ b/ppdet/core/workspace.py @@ -0,0 +1,276 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import importlib +import os +import sys + +import yaml +import copy +import collections + +try: + collectionsAbc = collections.abc +except AttributeError: + collectionsAbc = collections + +from .config.schema import SchemaDict, SharedConfig, extract_schema +from .config.yaml_helpers import serializable + +__all__ = [ + 'global_config', + 'load_config', + 'merge_config', + 'get_registered_modules', + 'create', + 'register', + 'serializable', + 'dump_value', +] + + +def dump_value(value): + # XXX this is hackish, but collections.abc is not available in python 2 + if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): + value = yaml.dump(value, default_flow_style=True) + value = value.replace('\n', '') + value = value.replace('...', '') + return "'{}'".format(value) + else: + # primitive types + return str(value) + + +class AttrDict(dict): + """Single level attribute dict, NOT recursive""" + + def __init__(self, **kwargs): + super(AttrDict, self).__init__() + super(AttrDict, self).update(kwargs) + + def __getattr__(self, key): + if key in self: + return self[key] + raise AttributeError("object has no attribute '{}'".format(key)) + + +global_config = AttrDict() + +BASE_KEY = '_BASE_' + + +# parse and load _BASE_ recursively +def _load_config_with_base(file_path): + with open(file_path) as f: + file_cfg = yaml.load(f, Loader=yaml.Loader) + + # NOTE: cfgs outside have higher priority than cfgs in _BASE_ + if BASE_KEY in file_cfg: + all_base_cfg = AttrDict() + base_ymls = list(file_cfg[BASE_KEY]) + for base_yml in base_ymls: + if base_yml.startswith("~"): + base_yml = os.path.expanduser(base_yml) + if not base_yml.startswith('/'): + base_yml = os.path.join(os.path.dirname(file_path), base_yml) + + with open(base_yml) as f: + base_cfg = _load_config_with_base(base_yml) + all_base_cfg = merge_config(base_cfg, all_base_cfg) + + del file_cfg[BASE_KEY] + return merge_config(file_cfg, all_base_cfg) + + return file_cfg + + +def load_config(file_path): + """ + Load config from file. + + Args: + file_path (str): Path of the config file to be loaded. + + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + + # load config from file and merge into global config + cfg = _load_config_with_base(file_path) + cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] + merge_config(cfg) + + return global_config + + +def dict_merge(dct, merge_dct): + """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of + updating only top-level keys, dict_merge recurses down into dicts nested + to an arbitrary depth, updating keys. The ``merge_dct`` is merged into + ``dct``. 
+ + Args: + dct: dict onto which the merge is executed + merge_dct: dct merged into dct + + Returns: dct + """ + for k, v in merge_dct.items(): + if (k in dct and isinstance(dct[k], dict) and + isinstance(merge_dct[k], collectionsAbc.Mapping)): + dict_merge(dct[k], merge_dct[k]) + else: + dct[k] = merge_dct[k] + return dct + + +def merge_config(config, another_cfg=None): + """ + Merge config into global config or another_cfg. + + Args: + config (dict): Config to be merged. + + Returns: global config + """ + global global_config + dct = another_cfg or global_config + return dict_merge(dct, config) + + +def get_registered_modules(): + return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} + + +def make_partial(cls): + op_module = importlib.import_module(cls.__op__.__module__) + op = getattr(op_module, cls.__op__.__name__) + cls.__category__ = getattr(cls, '__category__', None) or 'op' + + def partial_apply(self, *args, **kwargs): + kwargs_ = self.__dict__.copy() + kwargs_.update(kwargs) + return op(*args, **kwargs_) + + if getattr(cls, '__append_doc__', True): # XXX should default to True? + if sys.version_info[0] > 2: + cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) + cls.__init__.__doc__ = op.__doc__ + cls.__call__ = partial_apply + cls.__call__.__doc__ = op.__doc__ + else: + # XXX work around for python 2 + partial_apply.__doc__ = op.__doc__ + cls.__call__ = partial_apply + return cls + + +def register(cls): + """ + Register a given module class. + + Args: + cls (type): Module class to be registered. + + Returns: cls + """ + if cls.__name__ in global_config: + raise ValueError("Module class already registered: {}".format( + cls.__name__)) + if hasattr(cls, '__op__'): + cls = make_partial(cls) + global_config[cls.__name__] = extract_schema(cls) + return cls + + +def create(cls_or_name, **kwargs): + """ + Create an instance of given module class. + + Args: + cls_or_name (type or str): Class of which to create instance. 
+ + Returns: instance of type `cls_or_name` + """ + assert type(cls_or_name) in [type, str + ], "should be a class or name of a class" + name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ + assert name in global_config and \ + isinstance(global_config[name], SchemaDict), \ + "the module {} is not registered".format(name) + config = global_config[name] + cls = getattr(config.pymodule, name) + cls_kwargs = {} + cls_kwargs.update(global_config[name]) + + # parse `shared` annoation of registered modules + if getattr(config, 'shared', None): + for k in config.shared: + target_key = config[k] + shared_conf = config.schema[k].default + assert isinstance(shared_conf, SharedConfig) + if target_key is not None and not isinstance(target_key, + SharedConfig): + continue # value is given for the module + elif shared_conf.key in global_config: + # `key` is present in config + cls_kwargs[k] = global_config[shared_conf.key] + else: + cls_kwargs[k] = shared_conf.default_value + + # parse `inject` annoation of registered modules + if getattr(cls, 'from_config', None): + cls_kwargs.update(cls.from_config(config, **kwargs)) + + if getattr(config, 'inject', None): + for k in config.inject: + target_key = config[k] + # optional dependency + if target_key is None: + continue + + if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): + if 'name' not in target_key.keys(): + continue + inject_name = str(target_key['name']) + if inject_name not in global_config: + raise ValueError( + "Missing injection name {} and check it's name in cfg file". + format(k)) + target = global_config[inject_name] + for i, v in target_key.items(): + if i == 'name': + continue + target[i] = v + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(inject_name) + elif isinstance(target_key, str): + if target_key not in global_config: + raise ValueError("Missing injection config:", target_key) + target = global_config[target_key] + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(target_key) + elif hasattr(target, '__dict__'): # serialized object + cls_kwargs[k] = target + else: + raise ValueError("Unsupported injection type:", target_key) + # prevent modification of global config values of reference types + # (e.g., list, dict) from within the created module instances + #kwargs = copy.deepcopy(kwargs) + return cls(**cls_kwargs) diff --git a/ppdet/data/__init__.py b/ppdet/data/__init__.py new file mode 100644 index 0000000..a12aa32 --- /dev/null +++ b/ppdet/data/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import source +from . import transform +from . 
import reader + +from .source import * +from .transform import * +from .reader import * diff --git a/ppdet/data/__pycache__/__init__.cpython-38.pyc b/ppdet/data/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..918d2de Binary files /dev/null and b/ppdet/data/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/data/__pycache__/__init__.cpython-39.pyc b/ppdet/data/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..3592826 Binary files /dev/null and b/ppdet/data/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/data/__pycache__/reader.cpython-38.pyc b/ppdet/data/__pycache__/reader.cpython-38.pyc new file mode 100644 index 0000000..f15956b Binary files /dev/null and b/ppdet/data/__pycache__/reader.cpython-38.pyc differ diff --git a/ppdet/data/__pycache__/reader.cpython-39.pyc b/ppdet/data/__pycache__/reader.cpython-39.pyc new file mode 100644 index 0000000..a565b1f Binary files /dev/null and b/ppdet/data/__pycache__/reader.cpython-39.pyc differ diff --git a/ppdet/data/__pycache__/shm_utils.cpython-38.pyc b/ppdet/data/__pycache__/shm_utils.cpython-38.pyc new file mode 100644 index 0000000..9af0b32 Binary files /dev/null and b/ppdet/data/__pycache__/shm_utils.cpython-38.pyc differ diff --git a/ppdet/data/__pycache__/shm_utils.cpython-39.pyc b/ppdet/data/__pycache__/shm_utils.cpython-39.pyc new file mode 100644 index 0000000..5f2cdde Binary files /dev/null and b/ppdet/data/__pycache__/shm_utils.cpython-39.pyc differ diff --git a/ppdet/data/reader.py b/ppdet/data/reader.py new file mode 100644 index 0000000..bc34ec5 --- /dev/null +++ b/ppdet/data/reader.py @@ -0,0 +1,281 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import traceback +import six +import sys +import multiprocessing as mp +if sys.version_info >= (3, 0): + import queue as Queue +else: + import Queue +import numpy as np + +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from ppdet.core.workspace import register, serializable, create +from . import transform +from .shm_utils import _get_shared_memory_size_in_M + +from ppdet.utils.logger import setup_logger +logger = setup_logger('reader') + +MAIN_PID = os.getpid() + + +class Compose(object): + def __init__(self, transforms, num_classes=80): + self.transforms = transforms + self.transforms_cls = [] + for t in self.transforms: + for k, v in t.items(): + op_cls = getattr(transform, k) + f = op_cls(**v) + if hasattr(f, 'num_classes'): + f.num_classes = num_classes + + self.transforms_cls.append(f) + + def __call__(self, data): + for f in self.transforms_cls: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warn("fail to map op [{}] with error: {} and stack:\n{}". 
+ format(f, e, str(stack_info))) + raise e + + return data + + +class BatchCompose(Compose): + def __init__(self, transforms, num_classes=80): + super(BatchCompose, self).__init__(transforms, num_classes) + self.output_fields = mp.Manager().list([]) + self.lock = mp.Lock() + + def __call__(self, data): + for f in self.transforms_cls: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.warn("fail to map op [{}] with error: {} and stack:\n{}". + format(f, e, str(stack_info))) + raise e + + # accessing ListProxy in main process (no worker subprocess) + # may incur errors in some enviroments, ListProxy back to + # list if no worker process start, while this `__call__` + # will be called in main process + global MAIN_PID + if os.getpid() == MAIN_PID and \ + isinstance(self.output_fields, mp.managers.ListProxy): + self.output_fields = [] + + # parse output fields by first sample + # **this shoule be fixed if paddle.io.DataLoader support** + # For paddle.io.DataLoader not support dict currently, + # we need to parse the key from the first sample, + # BatchCompose.__call__ will be called in each worker + # process, so lock is need here. + if len(self.output_fields) == 0: + self.lock.acquire() + if len(self.output_fields) == 0: + for k, v in data[0].items(): + # FIXME(dkp): for more elegent coding + if k not in ['flipped', 'h', 'w']: + self.output_fields.append(k) + self.lock.release() + + data = [[data[i][k] for k in self.output_fields] + for i in range(len(data))] + data = list(zip(*data)) + + batch_data = [np.stack(d, axis=0) for d in data] + return batch_data + + +class BaseDataLoader(object): + """ + Base DataLoader implementation for detection models + + Args: + sample_transforms (list): a list of transforms to perform + on each sample + batch_transforms (list): a list of transforms to perform + on batch + batch_size (int): batch size for batch collating, default 1. + shuffle (bool): whether to shuffle samples + drop_last (bool): whether to drop the last incomplete, + default False + drop_empty (bool): whether to drop samples with no ground + truth labels, default True + num_classes (int): class number of dataset, default 80 + use_shared_memory (bool): whether to use shared memory to + accelerate data loading, enable this only if you + are sure that the shared memory size of your OS + is larger than memory cost of input datas of model. + Note that shared memory will be automatically + disabled if the shared memory of OS is less than + 1G, which is not enough for detection models. + Default False. 
+ """ + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=False, + drop_empty=True, + num_classes=80, + use_shared_memory=False, + **kwargs): + # sample transform + self._sample_transforms = Compose( + sample_transforms, num_classes=num_classes) + + # batch transfrom + self._batch_transforms = BatchCompose(batch_transforms, num_classes) + + self.batch_size = batch_size + self.shuffle = shuffle + self.drop_last = drop_last + self.use_shared_memory = use_shared_memory + self.kwargs = kwargs + + def __call__(self, + dataset, + worker_num, + batch_sampler=None, + return_list=False): + self.dataset = dataset + self.dataset.check_or_download_dataset() + self.dataset.parse_dataset() + # get data + self.dataset.set_transform(self._sample_transforms) + # set kwargs + self.dataset.set_kwargs(**self.kwargs) + # batch sampler + if batch_sampler is None: + self._batch_sampler = DistributedBatchSampler( + self.dataset, + batch_size=self.batch_size, + shuffle=self.shuffle, + drop_last=self.drop_last) + else: + self._batch_sampler = batch_sampler + + use_shared_memory = self.use_shared_memory + # check whether shared memory size is bigger than 1G(1024M) + if use_shared_memory: + shm_size = _get_shared_memory_size_in_M() + if shm_size is not None and shm_size < 1024.: + logger.warn("Shared memory size is less than 1G, " + "disable shared_memory in DataLoader") + use_shared_memory = False + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_sampler=self._batch_sampler, + collate_fn=self._batch_transforms, + num_workers=worker_num, + return_list=return_list, + use_shared_memory=use_shared_memory) + self.loader = iter(self.dataloader) + + return self + + def __len__(self): + return len(self._batch_sampler) + + def __iter__(self): + return self + + def __next__(self): + # pack {filed_name: field_data} here + # looking forward to support dictionary + # data structure in paddle.io.DataLoader + try: + data = next(self.loader) + return { + k: v + for k, v in zip(self._batch_transforms.output_fields, data) + } + except StopIteration: + self.loader = iter(self.dataloader) + six.reraise(*sys.exc_info()) + + def next(self): + # python2 compatibility + return self.__next__() + + +@register +class TrainReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=True, + drop_last=True, + drop_empty=True, + num_classes=80, + **kwargs): + super(TrainReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + drop_empty, num_classes, **kwargs) + + +@register +class EvalReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=True, + drop_empty=True, + num_classes=80, + **kwargs): + super(EvalReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + drop_empty, num_classes, **kwargs) + + +@register +class TestReader(BaseDataLoader): + __shared__ = ['num_classes'] + + def __init__(self, + sample_transforms=[], + batch_transforms=[], + batch_size=1, + shuffle=False, + drop_last=False, + drop_empty=True, + num_classes=80, + **kwargs): + super(TestReader, self).__init__(sample_transforms, batch_transforms, + batch_size, shuffle, drop_last, + drop_empty, num_classes, **kwargs) diff --git a/ppdet/data/shm_utils.py b/ppdet/data/shm_utils.py new file mode 100644 index 0000000..67a3962 
--- /dev/null +++ b/ppdet/data/shm_utils.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +SIZE_UNIT = ['K', 'M', 'G', 'T'] +SHM_QUERY_CMD = 'df -h' +SHM_KEY = 'shm' +SHM_DEFAULT_MOUNT = '/dev/shm' + +# [ shared memory size check ] +# In detection models, image/target data occupies a lot of memory and +# will occupy lots of shared memory in the multi-process DataLoader, so the +# following code gets the shared memory size and performs a size check to +# disable shared memory use if the shared memory size is not enough. +# The shared memory size is obtained as follows: +# 1. use `df -h` to get all mount info +# 2. pick up spaces whose mount info contains 'shm' +# 3. if there is only one 'shm' space, return its size +# 4. if there are multiple 'shm' spaces, prefer the default mount +#    directory '/dev/shm' on Linux-like systems, otherwise return the +#    biggest space size. + + +def _parse_size_in_M(size_str): +    num, unit = size_str[:-1], size_str[-1] +    assert unit in SIZE_UNIT, \ +        "unknown shm size unit {}".format(unit) +    return float(num) * \ +        (1024 ** (SIZE_UNIT.index(unit) - 1)) + + +def _get_shared_memory_size_in_M(): +    try: +        df_infos = os.popen(SHM_QUERY_CMD).readlines() +    except Exception: +        return None +    else: +        shm_infos = [] +        for df_info in df_infos: +            info = df_info.strip() +            if info.find(SHM_KEY) >= 0: +                shm_infos.append(info.split()) + +        if len(shm_infos) == 0: +            return None +        elif len(shm_infos) == 1: +            return _parse_size_in_M(shm_infos[0][3]) +        else: +            default_infos = [si for si in shm_infos \ +                             if si[-1] == SHM_DEFAULT_MOUNT] +            if len(default_infos) > 0: +                return _parse_size_in_M(default_infos[0][3]) +            else: +                return max([_parse_size_in_M(si[3]) \ +                        for si in shm_infos]) diff --git a/ppdet/data/source/__init__.py b/ppdet/data/source/__init__.py new file mode 100644 index 0000000..b63cba0 --- /dev/null +++ b/ppdet/data/source/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import coco +from . import voc +from . import widerface +from . 
import category + +from .coco import * +from .voc import * +from .widerface import * +from .category import * diff --git a/ppdet/data/source/__pycache__/__init__.cpython-38.pyc b/ppdet/data/source/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..848e6c1 Binary files /dev/null and b/ppdet/data/source/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/__init__.cpython-39.pyc b/ppdet/data/source/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..13c62bc Binary files /dev/null and b/ppdet/data/source/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/data/source/__pycache__/category.cpython-38.pyc b/ppdet/data/source/__pycache__/category.cpython-38.pyc new file mode 100644 index 0000000..aaa1dab Binary files /dev/null and b/ppdet/data/source/__pycache__/category.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/category.cpython-39.pyc b/ppdet/data/source/__pycache__/category.cpython-39.pyc new file mode 100644 index 0000000..7ba3a11 Binary files /dev/null and b/ppdet/data/source/__pycache__/category.cpython-39.pyc differ diff --git a/ppdet/data/source/__pycache__/coco.cpython-38.pyc b/ppdet/data/source/__pycache__/coco.cpython-38.pyc new file mode 100644 index 0000000..273836b Binary files /dev/null and b/ppdet/data/source/__pycache__/coco.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/coco.cpython-39.pyc b/ppdet/data/source/__pycache__/coco.cpython-39.pyc new file mode 100644 index 0000000..7917cfa Binary files /dev/null and b/ppdet/data/source/__pycache__/coco.cpython-39.pyc differ diff --git a/ppdet/data/source/__pycache__/dataset.cpython-38.pyc b/ppdet/data/source/__pycache__/dataset.cpython-38.pyc new file mode 100644 index 0000000..96c19c2 Binary files /dev/null and b/ppdet/data/source/__pycache__/dataset.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/dataset.cpython-39.pyc b/ppdet/data/source/__pycache__/dataset.cpython-39.pyc new file mode 100644 index 0000000..ed90bc5 Binary files /dev/null and b/ppdet/data/source/__pycache__/dataset.cpython-39.pyc differ diff --git a/ppdet/data/source/__pycache__/voc.cpython-38.pyc b/ppdet/data/source/__pycache__/voc.cpython-38.pyc new file mode 100644 index 0000000..800c58b Binary files /dev/null and b/ppdet/data/source/__pycache__/voc.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/voc.cpython-39.pyc b/ppdet/data/source/__pycache__/voc.cpython-39.pyc new file mode 100644 index 0000000..9a25fdd Binary files /dev/null and b/ppdet/data/source/__pycache__/voc.cpython-39.pyc differ diff --git a/ppdet/data/source/__pycache__/widerface.cpython-38.pyc b/ppdet/data/source/__pycache__/widerface.cpython-38.pyc new file mode 100644 index 0000000..477a947 Binary files /dev/null and b/ppdet/data/source/__pycache__/widerface.cpython-38.pyc differ diff --git a/ppdet/data/source/__pycache__/widerface.cpython-39.pyc b/ppdet/data/source/__pycache__/widerface.cpython-39.pyc new file mode 100644 index 0000000..79e3d16 Binary files /dev/null and b/ppdet/data/source/__pycache__/widerface.cpython-39.pyc differ diff --git a/ppdet/data/source/category.py b/ppdet/data/source/category.py new file mode 100644 index 0000000..06fbccc --- /dev/null +++ b/ppdet/data/source/category.py @@ -0,0 +1,800 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from ppdet.data.source.voc import pascalvoc_label +from ppdet.data.source.widerface import widerface_label +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['get_categories'] + + +def get_categories(metric_type, anno_file=None): + """ + Get class id to category id map and category id + to category name map from annotation file. + + Args: + metric_type (str): metric type, currently support 'coco', 'voc', 'oid' + and 'widerface'. + anno_file (str): annotation file path + """ + if metric_type.lower() == 'coco': + if anno_file and os.path.isfile(anno_file): + # lazy import pycocotools here + from pycocotools.coco import COCO + + coco = COCO(anno_file) + cats = coco.loadCats(coco.getCatIds()) + + clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} + catid2name = {cat['id']: cat['name'] for cat in cats} + + return clsid2catid, catid2name + + # anno file not exist, load default categories of COCO17 + else: + return _coco17_category() + + elif metric_type.lower() == 'voc': + if anno_file and os.path.isfile(anno_file): + cats = [] + with open(anno_file) as f: + for line in f.readlines(): + cats.append(line.strip()) + + if cats[0] == 'background': + cats = cats[1:] + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name + + # anno file not exist, load default categories of + # VOC all 20 categories + else: + return _vocall_category() + + elif metric_type.lower() == 'oid': + if anno_file and os.path.isfile(anno_file): + logger.warn("only default categories support for OID19") + return _oid19_category() + + elif metric_type.lower() == 'widerface': + return _widerface_category() + + else: + raise ValueError("unknown metric type {}".format(metric_type)) + + +def _coco17_category(): + """ + Get class id to category id map and category id + to category name map of COCO2017 dataset + + """ + clsid2catid = { + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 12: 13, + 13: 14, + 14: 15, + 15: 16, + 16: 17, + 17: 18, + 18: 19, + 19: 20, + 20: 21, + 21: 22, + 22: 23, + 23: 24, + 24: 25, + 25: 27, + 26: 28, + 27: 31, + 28: 32, + 29: 33, + 30: 34, + 31: 35, + 32: 36, + 33: 37, + 34: 38, + 35: 39, + 36: 40, + 37: 41, + 38: 42, + 39: 43, + 40: 44, + 41: 46, + 42: 47, + 43: 48, + 44: 49, + 45: 50, + 46: 51, + 47: 52, + 48: 53, + 49: 54, + 50: 55, + 51: 56, + 52: 57, + 53: 58, + 54: 59, + 55: 60, + 56: 61, + 57: 62, + 58: 63, + 59: 64, + 60: 65, + 61: 67, + 62: 70, + 63: 72, + 64: 73, + 65: 74, + 66: 75, + 67: 76, + 68: 77, + 69: 78, + 70: 79, + 71: 80, + 72: 81, + 73: 82, + 74: 84, + 75: 85, + 76: 86, + 77: 87, + 78: 88, + 79: 89, + 80: 90 + } + + catid2name = { + 0: 'background', + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 
'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', + 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' + } + + clsid2catid = {k - 1: v for k, v in clsid2catid.items()} + catid2name.pop(0) + + return clsid2catid, catid2name + + +def _vocall_category(): + """ + Get class id to category id map and category id + to category name map of mixup voc dataset + + """ + label_map = pascalvoc_label() + label_map = sorted(label_map.items(), key=lambda x: x[1]) + cats = [l[0] for l in label_map] + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name + + +def _widerface_category(): + label_map = widerface_label() + label_map = sorted(label_map.items(), key=lambda x: x[1]) + cats = [l[0] for l in label_map] + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name + + +def _oid19_category(): + clsid2catid = {k: k + 1 for k in range(500)} + + catid2name = { + 0: "background", + 1: "Infant bed", + 2: "Rose", + 3: "Flag", + 4: "Flashlight", + 5: "Sea turtle", + 6: "Camera", + 7: "Animal", + 8: "Glove", + 9: "Crocodile", + 10: "Cattle", + 11: "House", + 12: "Guacamole", + 13: "Penguin", + 14: "Vehicle registration plate", + 15: "Bench", + 16: "Ladybug", + 17: "Human nose", + 18: "Watermelon", + 19: "Flute", + 20: "Butterfly", + 21: "Washing machine", + 22: "Raccoon", + 23: "Segway", + 24: "Taco", + 25: "Jellyfish", + 26: "Cake", + 27: "Pen", + 28: "Cannon", + 29: "Bread", + 30: "Tree", + 31: "Shellfish", + 32: "Bed", + 33: "Hamster", + 34: "Hat", + 35: "Toaster", + 36: "Sombrero", + 37: "Tiara", + 38: "Bowl", + 39: "Dragonfly", + 40: "Moths and butterflies", + 41: "Antelope", + 42: "Vegetable", + 43: "Torch", + 44: "Building", + 45: "Power plugs and sockets", + 46: "Blender", + 47: "Billiard table", + 48: "Cutting board", + 49: "Bronze sculpture", + 50: "Turtle", + 51: "Broccoli", + 52: "Tiger", + 53: "Mirror", + 54: "Bear", + 55: "Zucchini", + 56: "Dress", + 57: "Volleyball", + 58: "Guitar", + 59: "Reptile", + 60: "Golf cart", + 61: "Tart", + 62: "Fedora", + 63: "Carnivore", + 64: "Car", + 65: "Lighthouse", + 66: "Coffeemaker", + 67: "Food processor", + 68: "Truck", + 69: "Bookcase", + 70: "Surfboard", + 71: "Footwear", + 72: "Bench", + 73: "Necklace", + 74: "Flower", + 75: "Radish", + 76: "Marine mammal", + 77: "Frying pan", + 78: "Tap", + 79: "Peach", + 80: "Knife", + 81: "Handbag", + 82: "Laptop", + 83: "Tent", + 84: "Ambulance", + 85: "Christmas tree", + 
86: "Eagle", + 87: "Limousine", + 88: "Kitchen & dining room table", + 89: "Polar bear", + 90: "Tower", + 91: "Football", + 92: "Willow", + 93: "Human head", + 94: "Stop sign", + 95: "Banana", + 96: "Mixer", + 97: "Binoculars", + 98: "Dessert", + 99: "Bee", + 100: "Chair", + 101: "Wood-burning stove", + 102: "Flowerpot", + 103: "Beaker", + 104: "Oyster", + 105: "Woodpecker", + 106: "Harp", + 107: "Bathtub", + 108: "Wall clock", + 109: "Sports uniform", + 110: "Rhinoceros", + 111: "Beehive", + 112: "Cupboard", + 113: "Chicken", + 114: "Man", + 115: "Blue jay", + 116: "Cucumber", + 117: "Balloon", + 118: "Kite", + 119: "Fireplace", + 120: "Lantern", + 121: "Missile", + 122: "Book", + 123: "Spoon", + 124: "Grapefruit", + 125: "Squirrel", + 126: "Orange", + 127: "Coat", + 128: "Punching bag", + 129: "Zebra", + 130: "Billboard", + 131: "Bicycle", + 132: "Door handle", + 133: "Mechanical fan", + 134: "Ring binder", + 135: "Table", + 136: "Parrot", + 137: "Sock", + 138: "Vase", + 139: "Weapon", + 140: "Shotgun", + 141: "Glasses", + 142: "Seahorse", + 143: "Belt", + 144: "Watercraft", + 145: "Window", + 146: "Giraffe", + 147: "Lion", + 148: "Tire", + 149: "Vehicle", + 150: "Canoe", + 151: "Tie", + 152: "Shelf", + 153: "Picture frame", + 154: "Printer", + 155: "Human leg", + 156: "Boat", + 157: "Slow cooker", + 158: "Croissant", + 159: "Candle", + 160: "Pancake", + 161: "Pillow", + 162: "Coin", + 163: "Stretcher", + 164: "Sandal", + 165: "Woman", + 166: "Stairs", + 167: "Harpsichord", + 168: "Stool", + 169: "Bus", + 170: "Suitcase", + 171: "Human mouth", + 172: "Juice", + 173: "Skull", + 174: "Door", + 175: "Violin", + 176: "Chopsticks", + 177: "Digital clock", + 178: "Sunflower", + 179: "Leopard", + 180: "Bell pepper", + 181: "Harbor seal", + 182: "Snake", + 183: "Sewing machine", + 184: "Goose", + 185: "Helicopter", + 186: "Seat belt", + 187: "Coffee cup", + 188: "Microwave oven", + 189: "Hot dog", + 190: "Countertop", + 191: "Serving tray", + 192: "Dog bed", + 193: "Beer", + 194: "Sunglasses", + 195: "Golf ball", + 196: "Waffle", + 197: "Palm tree", + 198: "Trumpet", + 199: "Ruler", + 200: "Helmet", + 201: "Ladder", + 202: "Office building", + 203: "Tablet computer", + 204: "Toilet paper", + 205: "Pomegranate", + 206: "Skirt", + 207: "Gas stove", + 208: "Cookie", + 209: "Cart", + 210: "Raven", + 211: "Egg", + 212: "Burrito", + 213: "Goat", + 214: "Kitchen knife", + 215: "Skateboard", + 216: "Salt and pepper shakers", + 217: "Lynx", + 218: "Boot", + 219: "Platter", + 220: "Ski", + 221: "Swimwear", + 222: "Swimming pool", + 223: "Drinking straw", + 224: "Wrench", + 225: "Drum", + 226: "Ant", + 227: "Human ear", + 228: "Headphones", + 229: "Fountain", + 230: "Bird", + 231: "Jeans", + 232: "Television", + 233: "Crab", + 234: "Microphone", + 235: "Home appliance", + 236: "Snowplow", + 237: "Beetle", + 238: "Artichoke", + 239: "Jet ski", + 240: "Stationary bicycle", + 241: "Human hair", + 242: "Brown bear", + 243: "Starfish", + 244: "Fork", + 245: "Lobster", + 246: "Corded phone", + 247: "Drink", + 248: "Saucer", + 249: "Carrot", + 250: "Insect", + 251: "Clock", + 252: "Castle", + 253: "Tennis racket", + 254: "Ceiling fan", + 255: "Asparagus", + 256: "Jaguar", + 257: "Musical instrument", + 258: "Train", + 259: "Cat", + 260: "Rifle", + 261: "Dumbbell", + 262: "Mobile phone", + 263: "Taxi", + 264: "Shower", + 265: "Pitcher", + 266: "Lemon", + 267: "Invertebrate", + 268: "Turkey", + 269: "High heels", + 270: "Bust", + 271: "Elephant", + 272: "Scarf", + 273: "Barrel", + 274: "Trombone", + 275: "Pumpkin", 
+ 276: "Box", + 277: "Tomato", + 278: "Frog", + 279: "Bidet", + 280: "Human face", + 281: "Houseplant", + 282: "Van", + 283: "Shark", + 284: "Ice cream", + 285: "Swim cap", + 286: "Falcon", + 287: "Ostrich", + 288: "Handgun", + 289: "Whiteboard", + 290: "Lizard", + 291: "Pasta", + 292: "Snowmobile", + 293: "Light bulb", + 294: "Window blind", + 295: "Muffin", + 296: "Pretzel", + 297: "Computer monitor", + 298: "Horn", + 299: "Furniture", + 300: "Sandwich", + 301: "Fox", + 302: "Convenience store", + 303: "Fish", + 304: "Fruit", + 305: "Earrings", + 306: "Curtain", + 307: "Grape", + 308: "Sofa bed", + 309: "Horse", + 310: "Luggage and bags", + 311: "Desk", + 312: "Crutch", + 313: "Bicycle helmet", + 314: "Tick", + 315: "Airplane", + 316: "Canary", + 317: "Spatula", + 318: "Watch", + 319: "Lily", + 320: "Kitchen appliance", + 321: "Filing cabinet", + 322: "Aircraft", + 323: "Cake stand", + 324: "Candy", + 325: "Sink", + 326: "Mouse", + 327: "Wine", + 328: "Wheelchair", + 329: "Goldfish", + 330: "Refrigerator", + 331: "French fries", + 332: "Drawer", + 333: "Treadmill", + 334: "Picnic basket", + 335: "Dice", + 336: "Cabbage", + 337: "Football helmet", + 338: "Pig", + 339: "Person", + 340: "Shorts", + 341: "Gondola", + 342: "Honeycomb", + 343: "Doughnut", + 344: "Chest of drawers", + 345: "Land vehicle", + 346: "Bat", + 347: "Monkey", + 348: "Dagger", + 349: "Tableware", + 350: "Human foot", + 351: "Mug", + 352: "Alarm clock", + 353: "Pressure cooker", + 354: "Human hand", + 355: "Tortoise", + 356: "Baseball glove", + 357: "Sword", + 358: "Pear", + 359: "Miniskirt", + 360: "Traffic sign", + 361: "Girl", + 362: "Roller skates", + 363: "Dinosaur", + 364: "Porch", + 365: "Human beard", + 366: "Submarine sandwich", + 367: "Screwdriver", + 368: "Strawberry", + 369: "Wine glass", + 370: "Seafood", + 371: "Racket", + 372: "Wheel", + 373: "Sea lion", + 374: "Toy", + 375: "Tea", + 376: "Tennis ball", + 377: "Waste container", + 378: "Mule", + 379: "Cricket ball", + 380: "Pineapple", + 381: "Coconut", + 382: "Doll", + 383: "Coffee table", + 384: "Snowman", + 385: "Lavender", + 386: "Shrimp", + 387: "Maple", + 388: "Cowboy hat", + 389: "Goggles", + 390: "Rugby ball", + 391: "Caterpillar", + 392: "Poster", + 393: "Rocket", + 394: "Organ", + 395: "Saxophone", + 396: "Traffic light", + 397: "Cocktail", + 398: "Plastic bag", + 399: "Squash", + 400: "Mushroom", + 401: "Hamburger", + 402: "Light switch", + 403: "Parachute", + 404: "Teddy bear", + 405: "Winter melon", + 406: "Deer", + 407: "Musical keyboard", + 408: "Plumbing fixture", + 409: "Scoreboard", + 410: "Baseball bat", + 411: "Envelope", + 412: "Adhesive tape", + 413: "Briefcase", + 414: "Paddle", + 415: "Bow and arrow", + 416: "Telephone", + 417: "Sheep", + 418: "Jacket", + 419: "Boy", + 420: "Pizza", + 421: "Otter", + 422: "Office supplies", + 423: "Couch", + 424: "Cello", + 425: "Bull", + 426: "Camel", + 427: "Ball", + 428: "Duck", + 429: "Whale", + 430: "Shirt", + 431: "Tank", + 432: "Motorcycle", + 433: "Accordion", + 434: "Owl", + 435: "Porcupine", + 436: "Sun hat", + 437: "Nail", + 438: "Scissors", + 439: "Swan", + 440: "Lamp", + 441: "Crown", + 442: "Piano", + 443: "Sculpture", + 444: "Cheetah", + 445: "Oboe", + 446: "Tin can", + 447: "Mango", + 448: "Tripod", + 449: "Oven", + 450: "Mouse", + 451: "Barge", + 452: "Coffee", + 453: "Snowboard", + 454: "Common fig", + 455: "Salad", + 456: "Marine invertebrates", + 457: "Umbrella", + 458: "Kangaroo", + 459: "Human arm", + 460: "Measuring cup", + 461: "Snail", + 462: "Loveseat", + 463: "Suit", + 
464: "Teapot", + 465: "Bottle", + 466: "Alpaca", + 467: "Kettle", + 468: "Trousers", + 469: "Popcorn", + 470: "Centipede", + 471: "Spider", + 472: "Sparrow", + 473: "Plate", + 474: "Bagel", + 475: "Personal care", + 476: "Apple", + 477: "Brassiere", + 478: "Bathroom cabinet", + 479: "studio couch", + 480: "Computer keyboard", + 481: "Table tennis racket", + 482: "Sushi", + 483: "Cabinetry", + 484: "Street light", + 485: "Towel", + 486: "Nightstand", + 487: "Rabbit", + 488: "Dolphin", + 489: "Dog", + 490: "Jug", + 491: "Wok", + 492: "Fire hydrant", + 493: "Human eye", + 494: "Skyscraper", + 495: "Backpack", + 496: "Potato", + 497: "Paper towel", + 498: "Lifejacket", + 499: "Bicycle wheel", + 500: "Toilet", + } + + return clsid2catid, catid2name diff --git a/ppdet/data/source/coco.py b/ppdet/data/source/coco.py new file mode 100644 index 0000000..cf08aad --- /dev/null +++ b/ppdet/data/source/coco.py @@ -0,0 +1,205 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from ppdet.core.workspace import register, serializable +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class COCODataSet(DetDataset): + """ + Load dataset with COCO format. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): coco annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. 
+ """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1): + super(COCODataSet, self).__init__(dataset_dir, image_dir, anno_path, + data_fields, sample_num) + self.load_image_only = False + self.load_semantic = False + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + ct = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + coco_rec = { + 'im_file': im_path, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + ins_anno_ids = coco.getAnnIds(imgIds=[img_id], iscrowd=False) + instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + for inst in instances: + # check gt bbox + if 'bbox' not in inst.keys(): + continue + else: + if not any(np.array(inst['bbox'])): + continue + + # read rbox anno or not + is_rbox_anno = True if len(inst['bbox']) == 5 else False + if is_rbox_anno: + xc, yc, box_w, box_h, angle = inst['bbox'] + x1 = xc - box_w / 2.0 + y1 = yc - box_h / 2.0 + x2 = x1 + box_w + y2 = y1 + box_h + else: + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h + eps = 1e-5 + if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: + inst['clean_bbox'] = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + if is_rbox_anno: + inst['clean_rbox'] = [xc, yc, box_w, box_h, angle] + bboxes.append(inst) + else: + logger.warning( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + + num_bbox = len(bboxes) + if num_bbox <= 0: + continue + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + if is_rbox_anno: + gt_rbox = np.zeros((num_bbox, 5), dtype=np.float32) + gt_theta = np.zeros((num_bbox, 1), dtype=np.int32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + difficult = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + + has_segmentation = False + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = self.catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + # xc, yc, w, h, theta + if is_rbox_anno: + gt_rbox[i, :] = box['clean_rbox'] + is_crowd[i][0] = box['iscrowd'] + # check RLE format + if 
'segmentation' in box and box['iscrowd'] == 1: + gt_poly[i] = [[0.0, 0.0], ] + elif 'segmentation' in box and box['segmentation']: + gt_poly[i] = box['segmentation'] + has_segmentation = True + + if has_segmentation and not any(gt_poly): + continue + + if is_rbox_anno: + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_rbox': gt_rbox, + 'gt_poly': gt_poly, + } if 'image' in self.data_fields else {} + else: + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } if 'image' in self.data_fields else {} + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + # TODO: remove load_semantic + if self.load_semantic and 'semantic' in self.data_fields: + seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', + 'train2017', im_fname[:-3] + 'png') + coco_rec.update({'semantic': seg_path}) + + logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( + im_path, img_id, im_h, im_w)) + records.append(coco_rec) + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert len(records) > 0, 'not found any coco record in %s' % (anno_path) + logger.debug('{} samples in file {}'.format(ct, anno_path)) + self.roidbs = records diff --git a/ppdet/data/source/dataset.py b/ppdet/data/source/dataset.py new file mode 100644 index 0000000..96b8132 --- /dev/null +++ b/ppdet/data/source/dataset.py @@ -0,0 +1,192 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from collections import OrderedDict +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence +from paddle.io import Dataset +from ppdet.core.workspace import register, serializable +from ppdet.utils.download import get_dataset_path +import copy + + +@serializable +class DetDataset(Dataset): + """ + Load detection dataset. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + use_default_label (bool): whether to load default label list. 
+ """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + use_default_label=None, + **kwargs): + super(DetDataset, self).__init__() + self.dataset_dir = dataset_dir if dataset_dir is not None else '' + self.anno_path = anno_path + self.image_dir = image_dir if image_dir is not None else '' + self.data_fields = data_fields + self.sample_num = sample_num + self.use_default_label = use_default_label + self._epoch = 0 + self._curr_iter = 0 + + def __len__(self, ): + return len(self.roidbs) + + def __getitem__(self, idx): + # data batch + roidb = copy.deepcopy(self.roidbs[idx]) + if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: + n = len(self.roidbs) + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: + n = len(self.roidbs) + idx = np.random.randint(n) + roidb = [roidb, copy.deepcopy(self.roidbs[idx])] + elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: + n = len(self.roidbs) + roidb = [roidb, ] + [ + copy.deepcopy(self.roidbs[np.random.randint(n)]) + for _ in range(3) + ] + if isinstance(roidb, Sequence): + for r in roidb: + r['curr_iter'] = self._curr_iter + else: + roidb['curr_iter'] = self._curr_iter + self._curr_iter += 1 + + return self.transform(roidb) + + def check_or_download_dataset(self): + self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, + self.image_dir) + + def set_kwargs(self, **kwargs): + self.mixup_epoch = kwargs.get('mixup_epoch', -1) + self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) + self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) + + def set_transform(self, transform): + self.transform = transform + + def set_epoch(self, epoch_id): + self._epoch = epoch_id + + def parse_dataset(self, ): + raise NotImplementedError( + "Need to implement parse_dataset method of Dataset") + + def get_anno(self): + if self.anno_path is None: + return + return os.path.join(self.dataset_dir, self.anno_path) + + +def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): + return f.lower().endswith(extensions) + + +def _make_dataset(dir): + dir = os.path.expanduser(dir) + if not os.path.isdir(dir): + raise ('{} should be a dir'.format(dir)) + images = [] + for root, _, fnames in sorted(os.walk(dir, followlinks=True)): + for fname in sorted(fnames): + path = os.path.join(root, fname) + if _is_valid_file(path): + images.append(path) + return images + + +@register +@serializable +class ImageFolder(DetDataset): + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + sample_num=-1, + use_default_label=None, + **kwargs): + super(ImageFolder, self).__init__( + dataset_dir, + image_dir, + anno_path, + sample_num=sample_num, + use_default_label=use_default_label) + self._imid2path = {} + self.roidbs = None + self.sample_num = sample_num + + def check_or_download_dataset(self): + return + + def parse_dataset(self, ): + if not self.roidbs: + self.roidbs = self._load_images() + + def _parse(self): + image_dir = self.image_dir + if not isinstance(image_dir, Sequence): + image_dir = [image_dir] + images = [] + for im_dir in image_dir: + if os.path.isdir(im_dir): + im_dir = os.path.join(self.dataset_dir, im_dir) + images.extend(_make_dataset(im_dir)) + elif os.path.isfile(im_dir) and _is_valid_file(im_dir): + images.append(im_dir) + return images + + def _load_images(self): + images = self._parse() + ct = 0 + records = [] + for image in images: + assert image != '' 
and os.path.isfile(image), \ + "Image {} not found".format(image) + if self.sample_num > 0 and ct >= self.sample_num: + break + rec = {'im_id': np.array([ct]), 'im_file': image} + self._imid2path[ct] = image + ct += 1 + records.append(rec) + assert len(records) > 0, "No image file found" + return records + + def get_imid2path(self): + return self._imid2path + + def set_images(self, images): + self.image_dir = images + self.roidbs = self._load_images() diff --git a/ppdet/data/source/voc.py b/ppdet/data/source/voc.py new file mode 100644 index 0000000..56b746c --- /dev/null +++ b/ppdet/data/source/voc.py @@ -0,0 +1,204 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np + +import xml.etree.ElementTree as ET + +from ppdet.core.workspace import register, serializable + +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class VOCDataSet(DetDataset): + """ + Load dataset with PascalVOC format. + + Notes: + `anno_path` must contains xml file and image file path for annotations. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): voc annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + label_list (str): if use_default_label is False, will load + mapping between category and class index. + """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + label_list=None): + super(VOCDataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num) + self.label_list = label_list + + def parse_dataset(self, ): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + # mapping category name to class id + # first_class:0, second_class:1, ... + records = [] + ct = 0 + cname2cid = {} + if self.label_list: + label_path = os.path.join(self.dataset_dir, self.label_list) + if not os.path.exists(label_path): + raise ValueError("label_list {} does not exists".format( + label_path)) + with open(label_path, 'r') as fr: + label_id = 0 + for line in fr.readlines(): + cname2cid[line.strip()] = label_id + label_id += 1 + else: + cname2cid = pascalvoc_label() + + with open(anno_path, 'r') as fr: + while True: + line = fr.readline() + if not line: + break + img_file, xml_file = [os.path.join(image_dir, x) \ + for x in line.strip().split()[:2]] + if not os.path.exists(img_file): + logger.warn( + 'Illegal image file: {}, and it will be ignored'.format( + img_file)) + continue + if not os.path.isfile(xml_file): + logger.warn('Illegal xml file: {}, and it will be ignored'. 
+ format(xml_file)) + continue + tree = ET.parse(xml_file) + if tree.find('id') is None: + im_id = np.array([ct]) + else: + im_id = np.array([int(tree.find('id').text)]) + + objs = tree.findall('object') + im_w = float(tree.find('size').find('width').text) + im_h = float(tree.find('size').find('height').text) + if im_w < 0 or im_h < 0: + logger.warn( + 'Illegal width: {} or height: {} in annotation, ' + 'and {} will be ignored'.format(im_w, im_h, xml_file)) + continue + gt_bbox = [] + gt_class = [] + gt_score = [] + difficult = [] + for i, obj in enumerate(objs): + cname = obj.find('name').text + + # user dataset may not contain difficult field + _difficult = obj.find('difficult') + _difficult = int( + _difficult.text) if _difficult is not None else 0 + + x1 = float(obj.find('bndbox').find('xmin').text) + y1 = float(obj.find('bndbox').find('ymin').text) + x2 = float(obj.find('bndbox').find('xmax').text) + y2 = float(obj.find('bndbox').find('ymax').text) + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(im_w - 1, x2) + y2 = min(im_h - 1, y2) + if x2 > x1 and y2 > y1: + gt_bbox.append([x1, y1, x2, y2]) + gt_class.append([cname2cid[cname]]) + gt_score.append([1.]) + difficult.append([_difficult]) + else: + logger.warn( + 'Found an invalid bbox in annotations: xml_file: {}' + ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( + xml_file, x1, y1, x2, y2)) + gt_bbox = np.array(gt_bbox).astype('float32') + gt_class = np.array(gt_class).astype('int32') + gt_score = np.array(gt_score).astype('float32') + difficult = np.array(difficult).astype('int32') + + voc_rec = { + 'im_file': img_file, + 'im_id': im_id, + 'h': im_h, + 'w': im_w + } if 'image' in self.data_fields else {} + + gt_rec = { + 'gt_class': gt_class, + 'gt_score': gt_score, + 'gt_bbox': gt_bbox, + 'difficult': difficult + } + for k, v in gt_rec.items(): + if k in self.data_fields: + voc_rec[k] = v + + if len(objs) != 0: + records.append(voc_rec) + + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert len(records) > 0, 'not found any voc record in %s' % ( + self.anno_path) + logger.debug('{} samples in file {}'.format(ct, anno_path)) + self.roidbs, self.cname2cid = records, cname2cid + + def get_label_list(self): + return os.path.join(self.dataset_dir, self.label_list) + + +def pascalvoc_label(): + labels_map = { + 'aeroplane': 0, + 'bicycle': 1, + 'bird': 2, + 'boat': 3, + 'bottle': 4, + 'bus': 5, + 'car': 6, + 'cat': 7, + 'chair': 8, + 'cow': 9, + 'diningtable': 10, + 'dog': 11, + 'horse': 12, + 'motorbike': 13, + 'person': 14, + 'pottedplant': 15, + 'sheep': 16, + 'sofa': 17, + 'train': 18, + 'tvmonitor': 19 + } + return labels_map diff --git a/ppdet/data/source/widerface.py b/ppdet/data/source/widerface.py new file mode 100644 index 0000000..b1813b0 --- /dev/null +++ b/ppdet/data/source/widerface.py @@ -0,0 +1,180 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
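As a usage sketch for the VOC data source defined above (the `my_voc` root, directory layout, and file names below are hypothetical, and PaddleDetection itself must be importable), the loader could be exercised like this:

```
# Hedged sketch, not part of the commit: load a hypothetical VOC-style dataset.
# my_voc/trainval.txt lists "<image path> <xml path>" pairs (relative to images/),
# and my_voc/label_list.txt holds one category name per line.
from ppdet.data.source.voc import VOCDataSet

dataset = VOCDataSet(
    dataset_dir='my_voc',          # hypothetical dataset root
    image_dir='images',            # prefixed to every path listed in trainval.txt
    anno_path='trainval.txt',      # "<image> <xml>" per line
    label_list='label_list.txt')   # optional; falls back to pascalvoc_label()
dataset.parse_dataset()

print(len(dataset.roidbs), 'records loaded')   # parsed samples
print(dataset.cname2cid)                       # category name -> class id map
```

`parse_dataset` skips entries whose image or xml file is missing and asserts that at least one record was found, so the sketch assumes the listed files exist.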
+ +import os +import numpy as np + +from ppdet.core.workspace import register, serializable +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class WIDERFaceDataSet(DetDataset): + """ + Load WiderFace records with 'anno_path' + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): WiderFace annotation data. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + with_lmk (bool): whether to load face landmark keypoint labels. + """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + with_lmk=False): + super(WIDERFaceDataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num, + with_lmk=with_lmk) + self.anno_path = anno_path + self.sample_num = sample_num + self.roidbs = None + self.cname2cid = None + self.with_lmk = with_lmk + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + txt_file = anno_path + + records = [] + ct = 0 + file_lists = self._load_file_list(txt_file) + cname2cid = widerface_label() + + for item in file_lists: + im_fname = item[0] + im_id = np.array([ct]) + gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32) + gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32) + gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32) + lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32) + for index_box in range(len(item)): + if index_box < 1: + continue + gt_bbox[index_box - 1] = item[index_box][0] + if self.with_lmk: + gt_lmk_labels[index_box - 1] = item[index_box][1] + lmk_ignore_flag[index_box - 1] = item[index_box][2] + im_fname = os.path.join(image_dir, + im_fname) if image_dir else im_fname + widerface_rec = { + 'im_file': im_fname, + 'im_id': im_id, + } if 'image' in self.data_fields else {} + gt_rec = { + 'gt_bbox': gt_bbox, + 'gt_class': gt_class, + } + for k, v in gt_rec.items(): + if k in self.data_fields: + widerface_rec[k] = v + if self.with_lmk: + widerface_rec['gt_keypoint'] = gt_lmk_labels + widerface_rec['keypoint_ignore'] = lmk_ignore_flag + + if len(item) != 0: + records.append(widerface_rec) + + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert len(records) > 0, 'not found any widerface in %s' % (anno_path) + logger.debug('{} samples in file {}'.format(ct, anno_path)) + self.roidbs, self.cname2cid = records, cname2cid + + def _load_file_list(self, input_txt): + with open(input_txt, 'r') as f_dir: + lines_input_txt = f_dir.readlines() + + file_dict = {} + num_class = 0 + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for i in range(len(lines_input_txt)): + line_txt = lines_input_txt[i].strip('\n\t\r') + split_str = line_txt.split(' ') + if len(split_str) == 1: + img_file_name = os.path.split(split_str[0])[1] + split_txt = img_file_name.split('.') + if len(split_txt) < 2: + continue + elif split_txt[-1] in exts: + if i != 0: + num_class += 1 + file_dict[num_class] = [line_txt] + else: + if len(line_txt) <= 6: + continue + result_boxs = [] + xmin = float(split_str[0]) + ymin = float(split_str[1]) + w = float(split_str[2]) + h = float(split_str[3]) + # Filter out wrong labels + if w < 0 or h < 0: 
+ logger.warn('Illegal box with w: {}, h: {} in ' + 'img: {}, and it will be ignored'.format( + w, h, file_dict[num_class][0])) + continue + xmin = max(0, xmin) + ymin = max(0, ymin) + xmax = xmin + w + ymax = ymin + h + gt_bbox = [xmin, ymin, xmax, ymax] + result_boxs.append(gt_bbox) + if self.with_lmk: + assert len(split_str) > 18, 'When `with_lmk=True`, the number' \ + 'of characters per line in the annotation file should' \ + 'exceed 18.' + lmk0_x = float(split_str[5]) + lmk0_y = float(split_str[6]) + lmk1_x = float(split_str[8]) + lmk1_y = float(split_str[9]) + lmk2_x = float(split_str[11]) + lmk2_y = float(split_str[12]) + lmk3_x = float(split_str[14]) + lmk3_y = float(split_str[15]) + lmk4_x = float(split_str[17]) + lmk4_y = float(split_str[18]) + lmk_ignore_flag = 0 if lmk0_x == -1 else 1 + gt_lmk_label = [ + lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x, + lmk3_y, lmk4_x, lmk4_y + ] + result_boxs.append(gt_lmk_label) + result_boxs.append(lmk_ignore_flag) + file_dict[num_class].append(result_boxs) + + return list(file_dict.values()) + + +def widerface_label(): + labels_map = {'face': 0} + return labels_map diff --git a/ppdet/data/transform/__init__.py b/ppdet/data/transform/__init__.py new file mode 100644 index 0000000..c5deb53 --- /dev/null +++ b/ppdet/data/transform/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import operators +from . 
import batch_operators + +from .operators import * +from .batch_operators import * + +__all__ = [] +__all__ += registered_ops diff --git a/ppdet/data/transform/__pycache__/__init__.cpython-38.pyc b/ppdet/data/transform/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..590acd8 Binary files /dev/null and b/ppdet/data/transform/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/data/transform/__pycache__/__init__.cpython-39.pyc b/ppdet/data/transform/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..e8df832 Binary files /dev/null and b/ppdet/data/transform/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/data/transform/__pycache__/batch_operators.cpython-38.pyc b/ppdet/data/transform/__pycache__/batch_operators.cpython-38.pyc new file mode 100644 index 0000000..52900b8 Binary files /dev/null and b/ppdet/data/transform/__pycache__/batch_operators.cpython-38.pyc differ diff --git a/ppdet/data/transform/__pycache__/batch_operators.cpython-39.pyc b/ppdet/data/transform/__pycache__/batch_operators.cpython-39.pyc new file mode 100644 index 0000000..7d16fa0 Binary files /dev/null and b/ppdet/data/transform/__pycache__/batch_operators.cpython-39.pyc differ diff --git a/ppdet/data/transform/__pycache__/op_helper.cpython-38.pyc b/ppdet/data/transform/__pycache__/op_helper.cpython-38.pyc new file mode 100644 index 0000000..eb77d17 Binary files /dev/null and b/ppdet/data/transform/__pycache__/op_helper.cpython-38.pyc differ diff --git a/ppdet/data/transform/__pycache__/op_helper.cpython-39.pyc b/ppdet/data/transform/__pycache__/op_helper.cpython-39.pyc new file mode 100644 index 0000000..21d7896 Binary files /dev/null and b/ppdet/data/transform/__pycache__/op_helper.cpython-39.pyc differ diff --git a/ppdet/data/transform/__pycache__/operators.cpython-38.pyc b/ppdet/data/transform/__pycache__/operators.cpython-38.pyc new file mode 100644 index 0000000..4d9657a Binary files /dev/null and b/ppdet/data/transform/__pycache__/operators.cpython-38.pyc differ diff --git a/ppdet/data/transform/__pycache__/operators.cpython-39.pyc b/ppdet/data/transform/__pycache__/operators.cpython-39.pyc new file mode 100644 index 0000000..7d64e29 Binary files /dev/null and b/ppdet/data/transform/__pycache__/operators.cpython-39.pyc differ diff --git a/ppdet/data/transform/autoaugment_utils.py b/ppdet/data/transform/autoaugment_utils.py new file mode 100644 index 0000000..78e3bb3 --- /dev/null +++ b/ppdet/data/transform/autoaugment_utils.py @@ -0,0 +1,1588 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Reference: +# https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py +"""AutoAugment util file.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import inspect +import math +from PIL import Image, ImageEnhance +import numpy as np +import os +import sys +import cv2 +from copy import deepcopy + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +# Represents an invalid bounding box that is used for checking for padding +# lists of bounding box coordinates for a few augmentation operations +_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] + + +def policy_v0(): + """Autoaugment policy that was used in AutoAugment Detection Paper.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], + [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], + [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], + [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], + ] + return policy + + +def policy_v1(): + """Autoaugment policy that was used in AutoAugment Detection Paper.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], + [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], + [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], + [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], + [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], + [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], + [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], + [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], + [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , + [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], + [('Color', 1.0, 6), ('Equalize', 1.0, 2)], + [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], + [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], + [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], + [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], + [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], + [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], + [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], + [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], + ] + return policy + + +def policy_vtest(): + """Autoaugment test policy for debugging.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] + return policy + + +def policy_v2(): + """Additional policy that performs well on object detection.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. 
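+    # For example, the sub-policy [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)]
+    # reads as: translate the image and its boxes horizontally with the stated
+    # probability 0.6 at magnitude 4 (out of _MAX_LEVEL=10), then equalize the
+    # result with probability 0.8; each operation's apply/skip decision is
+    # sampled independently per image.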
+ policy = [ + [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], + [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), + ('Rotate_BBox', 0.8, 10)], + [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], + [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), + ('Brightness', 0.0, 10)], + [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), + ('AutoContrast', 0.6, 0)], + [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], + [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), + ('Solarize', 0.0, 10)], + [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), + ('Rotate_BBox', 0.8, 8)], + [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], + [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), + ('Rotate_BBox', 0.6, 6)], + [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], + [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), + ('ShearY_BBox', 0.6, 8)], + [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), + ('Brightness', 0.2, 2)], + [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), + ('SolarizeAdd', 0.2, 10)], + [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], + ] + return policy + + +def policy_v3(): + """"Additional policy that performs well on object detection.""" + # Each tuple is an augmentation operation of the form + # (operation, probability, magnitude). Each element in policy is a + # sub-policy that will be applied sequentially on the image. + policy = [ + [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], + [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], + [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], + [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], + [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], + [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], + [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], + [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], + [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], + [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], + [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], + [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], + [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], + [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], + [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], + ] + return policy + + +def _equal(val1, val2, eps=1e-8): + return abs(val1 - val2) <= eps + + +def blend(image1, image2, factor): + """Blend image1 and image2 using 'factor'. + + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 255. + + Args: + image1: An image Tensor of type uint8. + image2: An image Tensor of type uint8. + factor: A floating point value above 0.0. + + Returns: + A blended image Tensor of type uint8. + """ + if factor == 0.0: + return image1 + if factor == 1.0: + return image2 + + image1 = image1.astype(np.float32) + image2 = image2.astype(np.float32) + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = image1 + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return temp.astype(np.uint8) + + # Extrapolate: + # + # We need to clip and then cast. 
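+    # Worked example: with factor=1.5 a pixel pair (image1=100, image2=200)
+    # extrapolates to 100 + 1.5 * 100 = 250, still representable, while
+    # (100, 220) gives 280 and is clipped back to 255 below.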
+ return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) + + +def cutout(image, pad_size, replace=0): + """Apply cutout (https://arxiv.org/abs/1708.04552) to image. + + This operation applies a (2*pad_size x 2*pad_size) mask of zeros to + a random location within `img`. The pixel values filled in will be of the + value `replace`. The located where the mask will be applied is randomly + chosen uniformly over the whole image. + + Args: + image: An image Tensor of type uint8. + pad_size: Specifies how big the zero mask that will be generated is that + is applied to the image. The mask will be of size + (2*pad_size x 2*pad_size). + replace: What pixel value to fill in the image in the area that has + the cutout mask applied to it. + + Returns: + An image Tensor that is of type uint8. + Example: + img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) + new_img = cutout(img, pad_size=50, replace=0) + """ + image_height, image_width = image.shape[0], image.shape[1] + + cutout_center_height = np.random.randint(low=0, high=image_height) + cutout_center_width = np.random.randint(low=0, high=image_width) + + lower_pad = np.maximum(0, cutout_center_height - pad_size) + upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) + left_pad = np.maximum(0, cutout_center_width - pad_size) + right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + mask = np.pad(np.zeros( + cutout_shape, dtype=image.dtype), + padding_dims, + 'constant', + constant_values=1) + mask = np.expand_dims(mask, -1) + mask = np.tile(mask, [1, 1, 3]) + image = np.where( + np.equal(mask, 0), + np.ones_like( + image, dtype=image.dtype) * replace, + image) + return image.astype(np.uint8) + + +def solarize(image, threshold=128): + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return np.where(image < threshold, image, 255 - image) + + +def solarize_add(image, addition=0, threshold=128): + # For each pixel in the image less than threshold + # we add 'addition' amount to it and then clip the + # pixel value to be between 0 and 255. The value + # of 'addition' is between -128 and 128. + added_image = image.astype(np.int64) + addition + added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) + return np.where(image < threshold, added_image, image) + + +def color(image, factor): + """use cv2 to deal""" + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) + return blend(degenerate, image, factor) + + +# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 +def contrast(img, factor): + img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) + return np.array(img) + + +def brightness(image, factor): + """Equivalent of PIL Brightness.""" + degenerate = np.zeros_like(image) + return blend(degenerate, image, factor) + + +def posterize(image, bits): + """Equivalent of PIL Posterize.""" + shift = 8 - bits + return np.left_shift(np.right_shift(image, shift), shift) + + +def rotate(image, degrees, replace): + """Rotates the image by degrees either clockwise or counterclockwise. + + Args: + image: An image Tensor of type uint8. 
+ degrees: Float, a scalar angle in degrees to rotate all images by. If + degrees is positive the image will be rotated clockwise otherwise it will + be rotated counterclockwise. + replace: A one or three value 1D tensor to fill empty pixels caused by + the rotate operation. + + Returns: + The rotated version of image. + """ + image = wrap(image) + image = Image.fromarray(image) + image = image.rotate(degrees) + image = np.array(image, dtype=np.uint8) + return unwrap(image, replace) + + +def random_shift_bbox(image, + bbox, + pixel_scaling, + replace, + new_min_bbox_coords=None): + """Move the bbox and the image content to a slightly new random location. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + The potential values for the new min corner of the bbox will be between + [old_min - pixel_scaling * bbox_height/2, + old_min - pixel_scaling * bbox_height/2]. + pixel_scaling: A float between 0 and 1 that specifies the pixel range + that the new bbox location will be sampled from. + replace: A one or three value 1D tensor to fill empty pixels. + new_min_bbox_coords: If not None, then this is a tuple that specifies the + (min_y, min_x) coordinates of the new bbox. Normally this is randomly + specified, but this allows it to be manually set. The coordinates are + the absolute coordinates between 0 and image height/width and are int32. + + Returns: + The new image that will have the shifted bbox location in it along with + the new bbox that contains the new coordinates. + """ + # Obtains image height and width and create helper clip functions. + image_height, image_width = image.shape[0], image.shape[1] + image_height = float(image_height) + image_width = float(image_width) + + def clip_y(val): + return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) + + def clip_x(val): + return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) + + # Convert bbox to pixel coordinates. + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = clip_y(image_height * bbox[2]) + max_x = clip_x(image_width * bbox[3]) + + bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) + image_height = int(image_height) + image_width = int(image_width) + + # Select the new min/max bbox ranges that are used for sampling the + # new min x/y coordinates of the shifted bbox. + minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / + 2.0)) + maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / + 2.0)) + minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) + maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) + + # Sample and calculate the new unclipped min/max coordinates of the new bbox. + if new_min_bbox_coords is None: + unclipped_new_min_y = np.random.randint( + low=minval_y, high=maxval_y, dtype=np.int32) + unclipped_new_min_x = np.random.randint( + low=minval_x, high=maxval_x, dtype=np.int32) + else: + unclipped_new_min_y, unclipped_new_min_x = ( + clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) + unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 + unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 + + # Determine if any of the new bbox was shifted outside the current image. + # This is used for determining if any of the original bbox content should be + # discarded. 
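+    # clip_y/clip_x clamp the proposed corners to the image; whatever is cut off
+    # the destination is also trimmed from the source region below
+    # (shifted_min_*/shifted_max_*), so the copied patch keeps a matching shape.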
+ new_min_y, new_min_x, new_max_y, new_max_x = ( + clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), + clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) + shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y + shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) + shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x + shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) + + # Create the new bbox tensor by converting pixel integer values to floats. + new_bbox = np.stack([ + float(new_min_y) / float(image_height), float(new_min_x) / + float(image_width), float(new_max_y) / float(image_height), + float(new_max_x) / float(image_width) + ]) + + # Copy the contents in the bbox and fill the old bbox location + # with gray (128). + bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: + shifted_max_x + 1, :] + + def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, + image_): + """Applies mask to bbox region in image then adds content_tensor to it.""" + mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], + [min_x_, (image_width - 1) - max_x_], [0, 0]], + 'constant', + constant_values=1) + + content_tensor = np.pad(content_tensor, + [[min_y_, (image_height - 1) - max_y_], + [min_x_, (image_width - 1) - max_x_], [0, 0]], + 'constant', + constant_values=0) + return image_ * mask + content_tensor + + # Zero out original bbox location. + mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] + grey_tensor = np.zeros_like(mask) + replace[0] + image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, + image) + + # Fill in bbox content to new bbox location. + mask = np.zeros_like(bbox_content) + image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, + bbox_content, image) + + return image.astype(np.uint8), new_bbox + + +def _clip_bbox(min_y, min_x, max_y, max_x): + """Clip bounding box coordinates between 0 and 1. + + Args: + min_y: Normalized bbox coordinate of type float between 0 and 1. + min_x: Normalized bbox coordinate of type float between 0 and 1. + max_y: Normalized bbox coordinate of type float between 0 and 1. + max_x: Normalized bbox coordinate of type float between 0 and 1. + + Returns: + Clipped coordinate values between 0 and 1. + """ + min_y = np.clip(min_y, a_min=0, a_max=1.0) + min_x = np.clip(min_x, a_min=0, a_max=1.0) + max_y = np.clip(max_y, a_min=0, a_max=1.0) + max_x = np.clip(max_x, a_min=0, a_max=1.0) + return min_y, min_x, max_y, max_x + + +def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): + """Adjusts bbox coordinates to make sure the area is > 0. + + Args: + min_y: Normalized bbox coordinate of type float between 0 and 1. + min_x: Normalized bbox coordinate of type float between 0 and 1. + max_y: Normalized bbox coordinate of type float between 0 and 1. + max_x: Normalized bbox coordinate of type float between 0 and 1. + delta: Float, this is used to create a gap of size 2 * delta between + bbox min/max coordinates that are the same on the boundary. + This prevents the bbox from having an area of zero. + + Returns: + Tuple of new bbox coordinates between 0 and 1 that will now have a + guaranteed area > 0. + """ + height = max_y - min_y + width = max_x - min_x + + def _adjust_bbox_boundaries(min_coord, max_coord): + # Make sure max is never 0 and min is never 1. 
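+            # Worked example: a box collapsed at the border with
+            # min_y == max_y == 1.0 and delta = 0.05 becomes (0.95, 1.0),
+            # recovering a height of delta instead of a zero area.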
+ max_coord = np.maximum(max_coord, 0.0 + delta) + min_coord = np.minimum(min_coord, 1.0 - delta) + return min_coord, max_coord + + if _equal(height, 0): + min_y, max_y = _adjust_bbox_boundaries(min_y, max_y) + + if _equal(width, 0): + min_x, max_x = _adjust_bbox_boundaries(min_x, max_x) + + return min_y, min_x, max_y, max_x + + +def _scale_bbox_only_op_probability(prob): + """Reduce the probability of the bbox-only operation. + + Probability is reduced so that we do not distort the content of too many + bounding boxes that are close to each other. The value of 3.0 was a chosen + hyper parameter when designing the autoaugment algorithm that we found + empirically to work well. + + Args: + prob: Float that is the probability of applying the bbox-only operation. + + Returns: + Reduced probability. + """ + return prob / 3.0 + + +def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): + """Applies augmentation_func to the subsection of image indicated by bbox. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + augmentation_func: Augmentation function that will be applied to the + subsection of image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A modified version of image, where the bbox location in the image will + have `ugmentation_func applied to it. + """ + image_height = image.shape[0] + image_width = image.shape[1] + + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = int(image_height * bbox[2]) + max_x = int(image_width * bbox[3]) + + # Clip to be sure the max values do not fall out of range. + max_y = np.minimum(max_y, image_height - 1) + max_x = np.minimum(max_x, image_width - 1) + + # Get the sub-tensor that is the image within the bounding box region. + bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :] + + # Apply the augmentation function to the bbox portion of the image. + augmented_bbox_content = augmentation_func(bbox_content, *args) + + # Pad the augmented_bbox_content and the mask to match the shape of original + # image. + augmented_bbox_content = np.pad( + augmented_bbox_content, [[min_y, (image_height - 1) - max_y], + [min_x, (image_width - 1) - max_x], [0, 0]], + 'constant', + constant_values=1) + + # Create a mask that will be used to zero out a part of the original image. + mask_tensor = np.zeros_like(bbox_content) + + mask_tensor = np.pad(mask_tensor, + [[min_y, (image_height - 1) - max_y], + [min_x, (image_width - 1) - max_x], [0, 0]], + 'constant', + constant_values=1) + # Replace the old bbox content with the new augmented content. + image = image * mask_tensor + augmented_bbox_content + return image.astype(np.uint8) + + +def _concat_bbox(bbox, bboxes): + """Helper function that concates bbox to bboxes along the first dimension.""" + + # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means + # we discard bboxes and start the bboxes Tensor with the current bbox. + bboxes_sum_check = np.sum(bboxes) + bbox = np.expand_dims(bbox, 0) + # This check will be true when it is an _INVALID_BOX + if _equal(bboxes_sum_check, -4): + bboxes = bbox + else: + bboxes = np.concatenate([bboxes, bbox], 0) + return bboxes + + +def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob, + augmentation_func, func_changes_bbox, + *args): + """Applies _apply_bbox_augmentation with probability prob. 
+ + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + new_bboxes: 2D Tensor that is a list of the bboxes in the image after they + have been altered by aug_func. These will only be changed when + func_changes_bbox is set to true. Each bbox has 4 elements + (min_y, min_x, max_y, max_x) of type float that are the normalized + bbox coordinates between 0 and 1. + prob: Float that is the probability of applying _apply_bbox_augmentation. + augmentation_func: Augmentation function that will be applied to the + subsection of image. + func_changes_bbox: Boolean. Does augmentation_func return bbox in addition + to image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A tuple. Fist element is a modified version of image, where the bbox + location in the image will have augmentation_func applied to it if it is + chosen to be called with probability `prob`. The second element is a + Tensor of Tensors of length 4 that will contain the altered bbox after + applying augmentation_func. + """ + should_apply_op = (np.random.rand() + prob >= 1) + if func_changes_bbox: + if should_apply_op: + augmented_image, bbox = augmentation_func(image, bbox, *args) + else: + augmented_image, bbox = (image, bbox) + else: + if should_apply_op: + augmented_image = _apply_bbox_augmentation(image, bbox, + augmentation_func, *args) + else: + augmented_image = image + new_bboxes = _concat_bbox(bbox, new_bboxes) + return augmented_image.astype(np.uint8), new_bboxes + + +def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, + func_changes_bbox, *args): + """Applies aug_func to the image for each bbox in bboxes. + + Args: + image: 3D uint8 Tensor. + bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float. + prob: Float that is the probability of applying aug_func to a specific + bounding box within the image. + aug_func: Augmentation function that will be applied to the + subsections of image indicated by the bbox values in bboxes. + func_changes_bbox: Boolean. Does augmentation_func return bbox in addition + to image. + *args: Additional parameters that will be passed into augmentation_func + when it is called. + + Returns: + A modified version of image, where each bbox location in the image will + have augmentation_func applied to it if it is chosen to be called with + probability prob independently across all bboxes. Also the final + bboxes are returned that will be unchanged if func_changes_bbox is set to + false and if true, the new altered ones will be returned. + """ + # Will keep track of the new altered bboxes after aug_func is repeatedly + # applied. The -1 values are a dummy value and this first Tensor will be + # removed upon appending the first real bbox. + new_bboxes = np.array(_INVALID_BOX) + + # If the bboxes are empty, then just give it _INVALID_BOX. The result + # will be thrown away. + bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes + + assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!" + + # pylint:disable=g-long-lambda + # pylint:disable=line-too-long + wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args) + # pylint:enable=g-long-lambda + # pylint:enable=line-too-long + + # Setup the while_loop. 
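+    # The cond/body pair below is a plain-Python port of the original
+    # tf.while_loop: starting from (image, _INVALID_BOX) it applies the wrapped
+    # augmentation once per box, threading the running (image, new_bboxes)
+    # state through each iteration.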
+ num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. + idx = 0 # Counter for the while loop. + + # Conditional function when to end the loop once we go over all bboxes + # images_and_bboxes contain (_image, _new_bboxes) + def cond(_idx, _images_and_bboxes): + return _idx < num_bboxes + + # Shuffle the bboxes so that the augmentation order is not deterministic if + # we are not changing the bboxes with aug_func. + # if not func_changes_bbox: + # print(bboxes) + # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) + # print(loop_bboxes) + # else: + # loop_bboxes = bboxes + # we can not shuffle the bbox because it does not contain class information here + loop_bboxes = deepcopy(bboxes) + + # Main function of while_loop where we repeatedly apply augmentation on the + # bboxes in the image. + # pylint:disable=g-long-lambda + body = lambda _idx, _images_and_bboxes: [ + _idx + 1, wrapped_aug_func(_images_and_bboxes[0], + loop_bboxes[_idx], + _images_and_bboxes[1])] + while (cond(idx, (image, new_bboxes))): + idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) + + # Either return the altered bboxes or the original ones depending on if + # we altered them in anyway. + if func_changes_bbox: + final_bboxes = new_bboxes + else: + final_bboxes = bboxes + return image, final_bboxes + + +def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, + func_changes_bbox, *args): + """Checks to be sure num bboxes > 0 before calling inner function.""" + num_bboxes = len(bboxes) + new_image = deepcopy(image) + new_bboxes = deepcopy(bboxes) + if num_bboxes != 0: + new_image, new_bboxes = _apply_multi_bbox_augmentation( + new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) + return new_image, new_bboxes + + +def rotate_only_bboxes(image, bboxes, prob, degrees, replace): + """Apply rotate to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) + + +def shear_x_only_bboxes(image, bboxes, prob, level, replace): + """Apply shear_x to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, shear_x, func_changes_bbox, level, replace) + + +def shear_y_only_bboxes(image, bboxes, prob, level, replace): + """Apply shear_y to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, shear_y, func_changes_bbox, level, replace) + + +def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): + """Apply translate_x to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) + + +def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): + """Apply translate_y to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) + + +def flip_only_bboxes(image, bboxes, prob): + """Apply flip_lr to each bbox in the image 
with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, + np.fliplr, func_changes_bbox) + + +def solarize_only_bboxes(image, bboxes, prob, threshold): + """Apply solarize to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize, + func_changes_bbox, threshold) + + +def equalize_only_bboxes(image, bboxes, prob): + """Apply equalize to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize, + func_changes_bbox) + + +def cutout_only_bboxes(image, bboxes, prob, pad_size, replace): + """Apply cutout to each bbox in the image with probability prob.""" + func_changes_bbox = False + prob = _scale_bbox_only_op_probability(prob) + return _apply_multi_bbox_augmentation_wrapper( + image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace) + + +def _rotate_bbox(bbox, image_height, image_width, degrees): + """Rotates the bbox coordinated by degrees. + + Args: + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + image_height: Int, height of the image. + image_width: Int, height of the image. + degrees: Float, a scalar angle in degrees to rotate all images by. If + degrees is positive the image will be rotated clockwise otherwise it will + be rotated counterclockwise. + + Returns: + A tensor of the same shape as bbox, but now with the rotated coordinates. + """ + image_height, image_width = (float(image_height), float(image_width)) + + # Convert from degrees to radians. + degrees_to_radians = math.pi / 180.0 + radians = degrees * degrees_to_radians + + # Translate the bbox to the center of the image and turn the normalized 0-1 + # coordinates to absolute pixel locations. + # Y coordinates are made negative as the y axis of images goes down with + # increasing pixel values, so we negate to make sure x axis and y axis points + # are in the traditionally positive direction. + min_y = -int(image_height * (bbox[0] - 0.5)) + min_x = int(image_width * (bbox[1] - 0.5)) + max_y = -int(image_height * (bbox[2] - 0.5)) + max_x = int(image_width * (bbox[3] - 0.5)) + coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x], + [max_y, max_x]]).astype(np.float32) + # Rotate the coordinates according to the rotation matrix clockwise if + # radians is positive, else negative + rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)], + [-math.sin(radians), math.cos(radians)]]) + new_coords = np.matmul(rotation_matrix, + np.transpose(coordinates)).astype(np.int32) + + # Find min/max values and convert them back to normalized 0-1 floats. + min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5) + min_x = float(np.min(new_coords[1, :])) / image_width + 0.5 + max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5) + max_x = float(np.max(new_coords[1, :])) / image_width + 0.5 + + # Clip the bboxes to be sure the fall between [0, 1]. 
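+    # Note: min_y above takes the *maximum* rotated coordinate (and max_y the
+    # minimum) because the y values were negated before rotating; _clip_bbox and
+    # _check_bbox_area below then restore the [0, 1] range and guard against a
+    # zero-area box.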
+ min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def rotate_with_bboxes(image, bboxes, degrees, replace): + # Rotate the image. + image = rotate(image, degrees, replace) + + # Convert bbox coordinates to pixel values. + image_height, image_width = image.shape[:2] + # pylint:disable=g-long-lambda + wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees) + # pylint:enable=g-long-lambda + new_bboxes = np.zeros_like(bboxes) + for idx in range(len(bboxes)): + new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx]) + return image, new_bboxes + + +def translate_x(image, pixels, replace): + """Equivalent of PIL Translate in X dimension.""" + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0)) + return unwrap(np.array(image), replace) + + +def translate_y(image, pixels, replace): + """Equivalent of PIL Translate in Y dimension.""" + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels)) + return unwrap(np.array(image), replace) + + +def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): + """Shifts the bbox coordinates by pixels. + + Args: + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + image_height: Int, height of the image. + image_width: Int, width of the image. + pixels: An int. How many pixels to shift the bbox. + shift_horizontal: Boolean. If true then shift in X dimension else shift in + Y dimension. + + Returns: + A tensor of the same shape as bbox, but now with the shifted coordinates. + """ + pixels = int(pixels) + # Convert bbox to integer pixel locations. + min_y = int(float(image_height) * bbox[0]) + min_x = int(float(image_width) * bbox[1]) + max_y = int(float(image_height) * bbox[2]) + max_x = int(float(image_width) * bbox[3]) + + if shift_horizontal: + min_x = np.maximum(0, min_x - pixels) + max_x = np.minimum(image_width, max_x - pixels) + else: + min_y = np.maximum(0, min_y - pixels) + max_y = np.minimum(image_height, max_y - pixels) + + # Convert bbox back to floats. + min_y = float(min_y) / float(image_height) + min_x = float(min_x) / float(image_width) + max_y = float(max_y) / float(image_height) + max_x = float(max_x) / float(image_width) + + # Clip the bboxes to be sure the fall between [0, 1]. + min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): + """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. + + Args: + image: 3D uint8 Tensor. + bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float with values + between [0, 1]. + pixels: An int. How many pixels to shift the image and bboxes + replace: A one or three value 1D tensor to fill empty pixels. + shift_horizontal: Boolean. If true then shift in X dimension else shift in + Y dimension. + + Returns: + A tuple containing a 3D uint8 Tensor that will be the result of translating + image by pixels. 
The second element of the tuple is bboxes, where now + the coordinates will be shifted to reflect the shifted image. + """ + if shift_horizontal: + image = translate_x(image, pixels, replace) + else: + image = translate_y(image, pixels, replace) + + # Convert bbox coordinates to pixel values. + image_height, image_width = image.shape[0], image.shape[1] + # pylint:disable=g-long-lambda + wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal) + # pylint:enable=g-long-lambda + new_bboxes = deepcopy(bboxes) + num_bboxes = len(bboxes) + for idx in range(num_bboxes): + new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx]) + return image.astype(np.uint8), new_bboxes + + +def shear_x(image, level, replace): + """Equivalent of PIL Shearing in X dimension.""" + # Shear parallel to x axis is a projective transform + # with a matrix form of: + # [1 level + # 0 1]. + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0)) + return unwrap(np.array(image), replace) + + +def shear_y(image, level, replace): + """Equivalent of PIL Shearing in Y dimension.""" + # Shear parallel to y axis is a projective transform + # with a matrix form of: + # [1 0 + # level 1]. + image = Image.fromarray(wrap(image)) + image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0)) + return unwrap(np.array(image), replace) + + +def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): + """Shifts the bbox according to how the image was sheared. + + Args: + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + image_height: Int, height of the image. + image_width: Int, height of the image. + level: Float. How much to shear the image. + shear_horizontal: If true then shear in X dimension else shear in + the Y dimension. + + Returns: + A tensor of the same shape as bbox, but now with the shifted coordinates. + """ + image_height, image_width = (float(image_height), float(image_width)) + + # Change bbox coordinates to be pixels. + min_y = int(image_height * bbox[0]) + min_x = int(image_width * bbox[1]) + max_y = int(image_height * bbox[2]) + max_x = int(image_width * bbox[3]) + coordinates = np.stack( + [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) + coordinates = coordinates.astype(np.float32) + + # Shear the coordinates according to the translation matrix. + if shear_horizontal: + translation_matrix = np.stack([[1, 0], [-level, 1]]) + else: + translation_matrix = np.stack([[1, -level], [0, 1]]) + translation_matrix = translation_matrix.astype(np.float32) + new_coords = np.matmul(translation_matrix, + np.transpose(coordinates)).astype(np.int32) + + # Find min/max values and convert them back to floats. + min_y = float(np.min(new_coords[0, :])) / image_height + min_x = float(np.min(new_coords[1, :])) / image_width + max_y = float(np.max(new_coords[0, :])) / image_height + max_x = float(np.max(new_coords[1, :])) / image_width + + # Clip the bboxes to be sure the fall between [0, 1]. + min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) + min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) + return np.stack([min_y, min_x, max_y, max_x]) + + +def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): + """Applies Shear Transformation to the image and shifts the bboxes. + + Args: + image: 3D uint8 Tensor. 
+ bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float with values + between [0, 1]. + level: Float. How much to shear the image. This value will be between + -0.3 to 0.3. + replace: A one or three value 1D tensor to fill empty pixels. + shear_horizontal: Boolean. If true then shear in X dimension else shear in + the Y dimension. + + Returns: + A tuple containing a 3D uint8 Tensor that will be the result of shearing + image by level. The second element of the tuple is bboxes, where now + the coordinates will be shifted to reflect the sheared image. + """ + if shear_horizontal: + image = shear_x(image, level, replace) + else: + image = shear_y(image, level, replace) + + # Convert bbox coordinates to pixel values. + image_height, image_width = image.shape[:2] + # pylint:disable=g-long-lambda + wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal) + # pylint:enable=g-long-lambda + new_bboxes = deepcopy(bboxes) + num_bboxes = len(bboxes) + for idx in range(num_bboxes): + new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx]) + return image.astype(np.uint8), new_bboxes + + +def autocontrast(image): + """Implements Autocontrast function from PIL. + + Args: + image: A 3D uint8 tensor. + + Returns: + The image after it has had autocontrast applied to it and will be of type + uint8. + """ + + def scale_channel(image): + """Scale the 2D image using the autocontrast rule.""" + # A possibly cheaper version can be done using cumsum/unique_with_counts + # over the histogram values, rather than iterating over the entire image. + # to compute mins and maxes. + lo = float(np.min(image)) + hi = float(np.max(image)) + + # Scale the image, making the lowest value 0 and the highest value 255. + def scale_values(im): + scale = 255.0 / (hi - lo) + offset = -lo * scale + im = im.astype(np.float32) * scale + offset + img = np.clip(im, a_min=0, a_max=255.0) + return im.astype(np.uint8) + + result = scale_values(image) if hi > lo else image + return result + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image[:, :, 0]) + s2 = scale_channel(image[:, :, 1]) + s3 = scale_channel(image[:, :, 2]) + image = np.stack([s1, s2, s3], 2) + return image + + +def sharpness(image, factor): + """Implements Sharpness function from PIL.""" + orig_image = image + image = image.astype(np.float32) + # Make image 4D for conv operation. + # SMOOTH PIL Kernel. + kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13. + result = cv2.filter2D(image, -1, kernel).astype(np.uint8) + + # Blend the final result. + return blend(result, orig_image, factor) + + +def equalize(image): + """Implements Equalize function from PIL using.""" + + def scale_channel(im, c): + """Scale the data in the channel to implement equalize.""" + im = im[:, :, c].astype(np.int32) + # Compute the histogram of the image channel. + histo, _ = np.histogram(im, range=[0, 255], bins=256) + + # For the purposes of computing the step, filter out the nonzeros. + nonzero = np.where(np.not_equal(histo, 0)) + nonzero_histo = np.reshape(np.take(histo, nonzero), [-1]) + step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + + def build_lut(histo, step): + # Compute the cumulative sum, shifting by step // 2 + # and then normalization by step. + lut = (np.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. 
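+            # Prepending 0 and dropping the last cumulative count mirrors how
+            # PIL builds its equalize lookup table, so the darkest populated
+            # bin maps to 0.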
+ lut = np.concatenate([[0], lut[:-1]], 0) + # Clip the counts to be in range. This is done + # in the C code for image.point. + return np.clip(lut, a_min=0, a_max=255).astype(np.uint8) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + if step == 0: + result = im + else: + result = np.take(build_lut(histo, step), im) + + return result.astype(np.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = np.stack([s1, s2, s3], 2) + return image + + +def wrap(image): + """Returns 'image' with an extra channel set to all 1s.""" + shape = image.shape + extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype) + extended = np.concatenate([image, extended_channel], 2).astype(image.dtype) + return extended + + +def unwrap(image, replace): + """Unwraps an image produced by wrap. + + Where there is a 0 in the last channel for every spatial position, + the rest of the three channels in that spatial dimension are grayed + (set to 128). Operations like translate and shear on a wrapped + Tensor will leave 0s in empty locations. Some transformations look + at the intensity of values to do preprocessing, and we want these + empty pixels to assume the 'average' value, rather than pure black. + + + Args: + image: A 3D Image Tensor with 4 channels. + replace: A one or three value 1D tensor to fill empty pixels. + + Returns: + image: A 3D image Tensor with 3 channels. + """ + image_shape = image.shape + # Flatten the spatial dimensions. + flattened_image = np.reshape(image, [-1, image_shape[2]]) + + # Find all pixels where the last channel is zero. + alpha_channel = flattened_image[:, 3] + + replace = np.concatenate([replace, np.ones([1], image.dtype)], 0) + + # Where they are zero, fill them in with 'replace'. + alpha_channel = np.reshape(alpha_channel, (-1, 1)) + alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1])) + + flattened_image = np.where( + np.equal(alpha_channel, 0), + np.ones_like( + flattened_image, dtype=image.dtype) * replace, + flattened_image) + + image = np.reshape(flattened_image, image_shape) + image = image[:, :, :3] + return image.astype(np.uint8) + + +def _cutout_inside_bbox(image, bbox, pad_fraction): + """Generates cutout mask and the mean pixel value of the bbox. + + First a location is randomly chosen within the image as the center where the + cutout mask will be applied. Note this can be towards the boundaries of the + image, so the full cutout mask may not be applied. + + Args: + image: 3D uint8 Tensor. + bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) + of type float that represents the normalized coordinates between 0 and 1. + pad_fraction: Float that specifies how large the cutout mask should be in + in reference to the size of the original bbox. If pad_fraction is 0.25, + then the cutout mask will be of shape + (0.25 * bbox height, 0.25 * bbox width). + + Returns: + A tuple. Fist element is a tensor of the same shape as image where each + element is either a 1 or 0 that is used to determine where the image + will have cutout applied. The second element is the mean of the pixels + in the image where the bbox is located. + mask value: [0,1] + """ + image_height, image_width = image.shape[0], image.shape[1] + # Transform from shape [1, 4] to [4]. 
+ bbox = np.squeeze(bbox) + + min_y = int(float(image_height) * bbox[0]) + min_x = int(float(image_width) * bbox[1]) + max_y = int(float(image_height) * bbox[2]) + max_x = int(float(image_width) * bbox[3]) + + # Calculate the mean pixel values in the bounding box, which will be used + # to fill the cutout region. + mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1)) + # Cutout mask will be size pad_size_heigh * 2 by pad_size_width * 2 if the + # region lies entirely within the bbox. + box_height = max_y - min_y + 1 + box_width = max_x - min_x + 1 + pad_size_height = int(pad_fraction * (box_height / 2)) + pad_size_width = int(pad_fraction * (box_width / 2)) + + # Sample the center location in the image where the zero mask will be applied. + cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32) + cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32) + + lower_pad = np.maximum(0, cutout_center_height - pad_size_height) + upper_pad = np.maximum( + 0, image_height - cutout_center_height - pad_size_height) + left_pad = np.maximum(0, cutout_center_width - pad_size_width) + right_pad = np.maximum(0, + image_width - cutout_center_width - pad_size_width) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + + mask = np.pad(np.zeros( + cutout_shape, dtype=image.dtype), + padding_dims, + 'constant', + constant_values=1) + + mask = np.expand_dims(mask, 2) + mask = np.tile(mask, [1, 1, 3]) + return mask, mean + + +def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean): + """Applies cutout to the image according to bbox information. + + This is a cutout variant that using bbox information to make more informed + decisions on where to place the cutout mask. + + Args: + image: 3D uint8 Tensor. + bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox + has 4 elements (min_y, min_x, max_y, max_x) of type float with values + between [0, 1]. + pad_fraction: Float that specifies how large the cutout mask should be in + in reference to the size of the original bbox. If pad_fraction is 0.25, + then the cutout mask will be of shape + (0.25 * bbox height, 0.25 * bbox width). + replace_with_mean: Boolean that specified what value should be filled in + where the cutout mask is applied. Since the incoming image will be of + uint8 and will not have had any mean normalization applied, by default + we set the value to be 128. If replace_with_mean is True then we find + the mean pixel values across the channel dimension and use those to fill + in where the cutout mask is applied. + + Returns: + A tuple. First element is a tensor of the same shape as image that has + cutout applied to it. Second element is the bboxes that were passed in + that will be unchanged. + """ + + def apply_bbox_cutout(image, bboxes, pad_fraction): + """Applies cutout to a single bounding box within image.""" + # Choose a single bounding box to apply cutout to. + random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32) + # Select the corresponding bbox and apply cutout. + chosen_bbox = np.take(bboxes, random_index, axis=0) + mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction) + + # When applying cutout we either set the pixel value to 128 or to the mean + # value inside the bbox. + replace = mean if replace_with_mean else [128] * 3 + + # Apply the cutout mask to the image. Where the mask is 0 we fill it with + # `replace`. 
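+            # `replace` (the per-channel bbox mean, or the constant 128) is
+            # broadcast over the whole image, but np.where only writes it where
+            # the padded mask is 0, i.e. inside the sampled cutout window.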
+ image = np.where( + np.equal(mask, 0), + np.ones_like( + image, dtype=image.dtype) * replace, + image).astype(image.dtype) + return image + + # Check to see if there are boxes, if so then apply boxcutout. + if len(bboxes) != 0: + image = apply_bbox_cutout(image, bboxes, pad_fraction) + + return image, bboxes + + +NAME_TO_FUNC = { + 'AutoContrast': autocontrast, + 'Equalize': equalize, + 'Posterize': posterize, + 'Solarize': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, + 'Cutout': cutout, + 'BBox_Cutout': bbox_cutout, + 'Rotate_BBox': rotate_with_bboxes, + # pylint:disable=g-long-lambda + 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( + image, bboxes, pixels, replace, shift_horizontal=True), + 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( + image, bboxes, pixels, replace, shift_horizontal=False), + 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( + image, bboxes, level, replace, shear_horizontal=True), + 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( + image, bboxes, level, replace, shear_horizontal=False), + # pylint:enable=g-long-lambda + 'Rotate_Only_BBoxes': rotate_only_bboxes, + 'ShearX_Only_BBoxes': shear_x_only_bboxes, + 'ShearY_Only_BBoxes': shear_y_only_bboxes, + 'TranslateX_Only_BBoxes': translate_x_only_bboxes, + 'TranslateY_Only_BBoxes': translate_y_only_bboxes, + 'Flip_Only_BBoxes': flip_only_bboxes, + 'Solarize_Only_BBoxes': solarize_only_bboxes, + 'Equalize_Only_BBoxes': equalize_only_bboxes, + 'Cutout_Only_BBoxes': cutout_only_bboxes, +} + + +def _randomly_negate_tensor(tensor): + """With 50% prob turn the tensor negative.""" + should_flip = np.floor(np.random.rand() + 0.5) >= 1 + final_tensor = tensor if should_flip else -tensor + return final_tensor + + +def _rotate_level_to_arg(level): + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate_tensor(level) + return (level, ) + + +def _shrink_level_to_arg(level): + """Converts level to ratio by which we shrink the image content.""" + if level == 0: + return (1.0, ) # if level is zero, do not shrink the image + # Maximum shrinking ratio is 2.9. + level = 2. / (_MAX_LEVEL / level) + 0.9 + return (level, ) + + +def _enhance_level_to_arg(level): + return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) + + +def _shear_level_to_arg(level): + level = (level / _MAX_LEVEL) * 0.3 + # Flip level to negative with 50% chance. + level = _randomly_negate_tensor(level) + return (level, ) + + +def _translate_level_to_arg(level, translate_const): + level = (level / _MAX_LEVEL) * float(translate_const) + # Flip level to negative with 50% chance. 
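+    # Worked example: with translate_const=250, level 4 maps to
+    # (4 / 10.) * 250 = 100 pixels before the random sign flip below.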
+ level = _randomly_negate_tensor(level) + return (level, ) + + +def _bbox_cutout_level_to_arg(level, hparams): + cutout_pad_fraction = (level / + _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction + return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean + + +def level_to_arg(hparams): + return { + 'AutoContrast': lambda level: (), + 'Equalize': lambda level: (), + 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), + 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), + 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), + 'Color': _enhance_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'Cutout': + lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 + # pylint:disable=g-long-lambda + 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), + 'TranslateX_BBox': + lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 + 'TranslateY_BBox': + lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons + # pylint:enable=g-long-lambda + 'ShearX_BBox': _shear_level_to_arg, + 'ShearY_BBox': _shear_level_to_arg, + 'Rotate_BBox': _rotate_level_to_arg, + 'Rotate_Only_BBoxes': _rotate_level_to_arg, + 'ShearX_Only_BBoxes': _shear_level_to_arg, + 'ShearY_Only_BBoxes': _shear_level_to_arg, + # pylint:disable=g-long-lambda + 'TranslateX_Only_BBoxes': + lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const + 'TranslateY_Only_BBoxes': + lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const + # pylint:enable=g-long-lambda + 'Flip_Only_BBoxes': lambda level: (), + 'Solarize_Only_BBoxes': + lambda level: (int((level / _MAX_LEVEL) * 256), ), + 'Equalize_Only_BBoxes': lambda level: (), + # pylint:disable=g-long-lambda + 'Cutout_Only_BBoxes': + lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const + # pylint:enable=g-long-lambda + } + + +def bbox_wrapper(func): + """Adds a bboxes function argument to func and returns unchanged bboxes.""" + + def wrapper(images, bboxes, *args, **kwargs): + return (func(images, *args, **kwargs), bboxes) + + return wrapper + + +def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): + """Return the function that corresponds to `name` and update `level` param.""" + func = NAME_TO_FUNC[name] + args = level_to_arg(augmentation_hparams)[name](level) + + # Check to see if prob is passed into function. This is used for operations + # where we alter bboxes independently. + # pytype:disable=wrong-arg-types + if 'prob' in inspect.getfullargspec(func)[0]: + args = tuple([prob] + list(args)) + # pytype:enable=wrong-arg-types + + # Add in replace arg if it is required for the function that is being called. + if 'replace' in inspect.getfullargspec(func)[0]: + # Make sure replace is the final argument + assert 'replace' == inspect.getfullargspec(func)[0][-1] + args = tuple(list(args) + [replace_value]) + + # Add bboxes as the second positional argument for the function if it does + # not already exist. 
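+    # Illustrative trace: for ('TranslateX_BBox', 0.6, 4) the lambda in
+    # NAME_TO_FUNC already accepts `bboxes`, `prob` is not in its signature so it
+    # is only kept for the later apply/skip decision, level 4 becomes +/-100
+    # pixels, and `replace` is appended last, giving args=(pixels, [128, 128, 128]).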
+ if 'bboxes' not in inspect.getfullargspec(func)[0]: + func = bbox_wrapper(func) + return (func, prob, args) + + +def _apply_func_with_prob(func, image, args, prob, bboxes): + """Apply `func` to image w/ `args` as input with probability `prob`.""" + assert isinstance(args, tuple) + assert 'bboxes' == inspect.getfullargspec(func)[0][1] + + # If prob is a function argument, then this randomness is being handled + # inside the function, so make sure it is always called. + if 'prob' in inspect.getfullargspec(func)[0]: + prob = 1.0 + + # Apply the function with probability `prob`. + should_apply_op = np.floor(np.random.rand() + 0.5) >= 1 + if should_apply_op: + augmented_image, augmented_bboxes = func(image, bboxes, *args) + else: + augmented_image, augmented_bboxes = (image, bboxes) + return augmented_image, augmented_bboxes + + +def select_and_apply_random_policy(policies, image, bboxes): + """Select a random policy from `policies` and apply it to `image`.""" + policy_to_select = np.random.randint(0, len(policies), dtype=np.int32) + # policy_to_select = 6 # for test + for (i, policy) in enumerate(policies): + if i == policy_to_select: + image, bboxes = policy(image, bboxes) + return (image, bboxes) + + +def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams): + """Build a policy from the given policies passed in and apply to image. + + Args: + policies: list of lists of tuples in the form `(func, prob, level)`, `func` + is a string name of the augmentation function, `prob` is the probability + of applying the `func` operation, `level` is the input argument for + `func`. + image: numpy array that the resulting policy will be applied to. + bboxes: + augmentation_hparams: Hparams associated with the NAS learned policy. + + Returns: + A version of image that now has data augmentation applied to it based on + the `policies` pass into the function. Additionally, returns bboxes if + a value for them is passed in that is not None + """ + replace_value = [128, 128, 128] + + # func is the string name of the augmentation function, prob is the + # probability of applying the operation and level is the parameter associated + + # tf_policies are functions that take in an image and return an augmented + # image. + tf_policies = [] + for policy in policies: + tf_policy = [] + # Link string name to the correct python function and make sure the correct + # argument is passed into that function. + for policy_info in policy: + policy_info = list( + policy_info) + [replace_value, augmentation_hparams] + + tf_policy.append(_parse_policy_info(*policy_info)) + # Now build the tf policy that will apply the augmentation procedue + # on image. + def make_final_policy(tf_policy_): + def final_policy(image_, bboxes_): + for func, prob, args in tf_policy_: + image_, bboxes_ = _apply_func_with_prob(func, image_, args, + prob, bboxes_) + return image_, bboxes_ + + return final_policy + + tf_policies.append(make_final_policy(tf_policy)) + + augmented_images, augmented_bboxes = select_and_apply_random_policy( + tf_policies, image, bboxes) + # If no bounding boxes were specified, then just return the images. + return (augmented_images, augmented_bboxes) + + +# TODO(barretzoph): Add in ArXiv link once paper is out. +def distort_image_with_autoaugment(image, bboxes, augmentation_name): + """Applies the AutoAugment policy to `image` and `bboxes`. + + Args: + image: `Tensor` of shape [height, width, 3] representing an image. 
+ bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are + normalized between [0, 1]. + augmentation_name: The name of the AutoAugment policy to use. The available + options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for + all of the results in the paper and was found to achieve the best results + on the COCO dataset. `v1`, `v2` and `v3` are additional good policies + found on the COCO dataset that have slight variation in what operations + were used during the search procedure along with how many operations are + applied in parallel to a single image (2 vs 3). + + Returns: + A tuple containing the augmented versions of `image` and `bboxes`. + """ + available_policies = { + 'v0': policy_v0, + 'v1': policy_v1, + 'v2': policy_v2, + 'v3': policy_v3, + 'test': policy_vtest + } + if augmentation_name not in available_policies: + raise ValueError('Invalid augmentation_name: {}'.format( + augmentation_name)) + + policy = available_policies[augmentation_name]() + augmentation_hparams = {} + return build_and_apply_nas_policy(policy, image, bboxes, + augmentation_hparams) diff --git a/ppdet/data/transform/batch_operators.py b/ppdet/data/transform/batch_operators.py new file mode 100644 index 0000000..3ae8477 --- /dev/null +++ b/ppdet/data/transform/batch_operators.py @@ -0,0 +1,904 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + +import cv2 +import numpy as np +from .operators import register_op, BaseOperator, Resize +from .op_helper import jaccard_overlap, gaussian2D +from scipy import ndimage + +from ppdet.modeling import bbox_utils +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', + 'Gt2TTFTarget', 'Gt2Solov2Target', 'RboxPadBatch' +] + + +@register_op +class PadBatch(BaseOperator): + """ + Pad a batch of samples so they can be divisible by a stride. + The layout of each image should be 'CHW'. + Args: + pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure + height and width is divisible by `pad_to_stride`. + """ + + def __init__(self, pad_to_stride=0, pad_gt=False): + super(PadBatch, self).__init__() + self.pad_to_stride = pad_to_stride + self.pad_gt = pad_gt + + def __call__(self, samples, context=None): + """ + Args: + samples (list): a batch of sample, each is dict. 
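+        Returns:
+            samples (list): the same batch, with each 'image' zero-padded to the
+                batch-wise max shape (rounded up to `pad_to_stride`) and, when
+                `pad_gt` is True, the gt fields padded to a common length.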
+ """ + coarsest_stride = self.pad_to_stride + + max_shape = np.array([data['image'].shape for data in samples]).max( + axis=0) + if coarsest_stride > 0: + max_shape[1] = int( + np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) + max_shape[2] = int( + np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) + + padding_batch = [] + for data in samples: + im = data['image'] + im_c, im_h, im_w = im.shape[:] + padding_im = np.zeros( + (im_c, max_shape[1], max_shape[2]), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + data['image'] = padding_im + if 'semantic' in data and data['semantic'] is not None: + semantic = data['semantic'] + padding_sem = np.zeros( + (1, max_shape[1], max_shape[2]), dtype=np.float32) + padding_sem[:, :im_h, :im_w] = semantic + data['semantic'] = padding_sem + if 'gt_segm' in data and data['gt_segm'] is not None: + gt_segm = data['gt_segm'] + padding_segm = np.zeros( + (gt_segm.shape[0], max_shape[1], max_shape[2]), + dtype=np.uint8) + padding_segm[:, :im_h, :im_w] = gt_segm + data['gt_segm'] = padding_segm + + if self.pad_gt: + gt_num = [] + if 'gt_poly' in data and data['gt_poly'] is not None and len(data[ + 'gt_poly']) > 0: + pad_mask = True + else: + pad_mask = False + + if pad_mask: + poly_num = [] + poly_part_num = [] + point_num = [] + for data in samples: + gt_num.append(data['gt_bbox'].shape[0]) + if pad_mask: + poly_num.append(len(data['gt_poly'])) + for poly in data['gt_poly']: + poly_part_num.append(int(len(poly))) + for p_p in poly: + point_num.append(int(len(p_p) / 2)) + gt_num_max = max(gt_num) + + for i, data in enumerate(samples): + gt_box_data = -np.ones([gt_num_max, 4], dtype=np.float32) + gt_class_data = -np.ones([gt_num_max], dtype=np.int32) + is_crowd_data = np.ones([gt_num_max], dtype=np.int32) + difficult_data = np.ones([gt_num_max], dtype=np.int32) + + if pad_mask: + poly_num_max = max(poly_num) + poly_part_num_max = max(poly_part_num) + point_num_max = max(point_num) + gt_masks_data = -np.ones( + [poly_num_max, poly_part_num_max, point_num_max, 2], + dtype=np.float32) + + gt_num = data['gt_bbox'].shape[0] + gt_box_data[0:gt_num, :] = data['gt_bbox'] + gt_class_data[0:gt_num] = np.squeeze(data['gt_class']) + if 'is_crowd' in data: + is_crowd_data[0:gt_num] = np.squeeze(data['is_crowd']) + data['is_crowd'] = is_crowd_data + if 'difficult' in data: + difficult_data[0:gt_num] = np.squeeze(data['difficult']) + data['difficult'] = difficult_data + if pad_mask: + for j, poly in enumerate(data['gt_poly']): + for k, p_p in enumerate(poly): + pp_np = np.array(p_p).reshape(-1, 2) + gt_masks_data[j, k, :pp_np.shape[0], :] = pp_np + data['gt_poly'] = gt_masks_data + data['gt_bbox'] = gt_box_data + data['gt_class'] = gt_class_data + + return samples + + +@register_op +class BatchRandomResize(BaseOperator): + """ + Resize image to target size randomly. 
random target_size and interpolation method + Args: + target_size (int, list, tuple): image target size, if random size is True, must be list or tuple + keep_ratio (bool): whether keep_raio or not, default true + interp (int): the interpolation method + random_size (bool): whether random select target size of image + random_interp (bool): whether random select interpolation method + """ + + def __init__(self, + target_size, + keep_ratio, + interp=cv2.INTER_NEAREST, + random_size=True, + random_interp=False): + super(BatchRandomResize, self).__init__() + self.keep_ratio = keep_ratio + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + self.interp = interp + assert isinstance(target_size, ( + int, Sequence)), "target_size must be int, list or tuple" + if random_size and not isinstance(target_size, list): + raise TypeError( + "Type of target_size is invalid when random_size is True. Must be List, now is {}". + format(type(target_size))) + self.target_size = target_size + self.random_size = random_size + self.random_interp = random_interp + + def __call__(self, samples, context=None): + if self.random_size: + target_size = np.random.choice(self.target_size) + else: + target_size = self.target_size + + if self.random_interp: + interp = np.random.choice(self.interps) + else: + interp = self.interp + + resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) + return resizer(samples, context=context) + + +@register_op +class Gt2YoloTarget(BaseOperator): + """ + Generate YOLOv3 targets by groud truth data, this operator is only used in + fine grained YOLOv3 loss mode + """ + + def __init__(self, + anchors, + anchor_masks, + downsample_ratios, + num_classes=80, + iou_thresh=1.): + super(Gt2YoloTarget, self).__init__() + self.anchors = anchors + self.anchor_masks = anchor_masks + self.downsample_ratios = downsample_ratios + self.num_classes = num_classes + self.iou_thresh = iou_thresh + + def __call__(self, samples, context=None): + assert len(self.anchor_masks) == len(self.downsample_ratios), \ + "anchor_masks', and 'downsample_ratios' should have same length." + + h, w = samples[0]['image'].shape[1:3] + an_hw = np.array(self.anchors) / np.array([[w, h]]) + for sample in samples: + # im, gt_bbox, gt_class, gt_score = sample + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + if 'gt_score' not in sample: + sample['gt_score'] = np.ones( + (gt_bbox.shape[0], 1), dtype=np.float32) + gt_score = sample['gt_score'] + for i, ( + mask, downsample_ratio + ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): + grid_h = int(h / downsample_ratio) + grid_w = int(w / downsample_ratio) + target = np.zeros( + (len(mask), 6 + self.num_classes, grid_h, grid_w), + dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx, gy, gw, gh = gt_bbox[b, :] + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. or score <= 0.: + continue + + # find best match anchor index + best_iou = 0. 
+ best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + + gi = int(gx * grid_w) + gj = int(gy * grid_h) + + # gtbox should be regresed in this layes if best match + # anchor index in anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log( + gw * w / self.anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log( + gh * h / self.anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + target[best_n, 5, gj, gi] = score + + # classification + target[best_n, 6 + cls, gj, gi] = 1. + + # For non-matched anchors, calculate the target if the iou + # between anchor and gt is larger than iou_thresh + if self.iou_thresh < 1: + for idx, mask_i in enumerate(mask): + if mask_i == best_idx: continue + iou = jaccard_overlap( + [0., 0., gw, gh], + [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) + if iou > self.iou_thresh and target[idx, 5, gj, + gi] == 0.: + # x, y, w, h, scale + target[idx, 0, gj, gi] = gx * grid_w - gi + target[idx, 1, gj, gi] = gy * grid_h - gj + target[idx, 2, gj, gi] = np.log( + gw * w / self.anchors[mask_i][0]) + target[idx, 3, gj, gi] = np.log( + gh * h / self.anchors[mask_i][1]) + target[idx, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + target[idx, 5, gj, gi] = score + + # classification + target[idx, 6 + cls, gj, gi] = 1. + sample['target{}'.format(i)] = target + + # remove useless gt_class and gt_score after target calculated + sample.pop('gt_class') + sample.pop('gt_score') + + return samples + + +@register_op +class Gt2FCOSTarget(BaseOperator): + """ + Generate FCOS targets by groud truth data + """ + + def __init__(self, + object_sizes_boundary, + center_sampling_radius, + downsample_ratios, + norm_reg_targets=False): + super(Gt2FCOSTarget, self).__init__() + self.center_sampling_radius = center_sampling_radius + self.downsample_ratios = downsample_ratios + self.INF = np.inf + self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF] + object_sizes_of_interest = [] + for i in range(len(self.object_sizes_boundary) - 1): + object_sizes_of_interest.append([ + self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1] + ]) + self.object_sizes_of_interest = object_sizes_of_interest + self.norm_reg_targets = norm_reg_targets + + def _compute_points(self, w, h): + """ + compute the corresponding points in each feature map + :param h: image height + :param w: image width + :return: points from all feature map + """ + locations = [] + for stride in self.downsample_ratios: + shift_x = np.arange(0, w, stride).astype(np.float32) + shift_y = np.arange(0, h, stride).astype(np.float32) + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shift_x = shift_x.flatten() + shift_y = shift_y.flatten() + location = np.stack([shift_x, shift_y], axis=1) + stride // 2 + locations.append(location) + num_points_each_level = [len(location) for location in locations] + locations = np.concatenate(locations, axis=0) + return locations, num_points_each_level + + def _convert_xywh2xyxy(self, gt_bbox, w, h): + """ + convert the bounding box from style xywh to xyxy + :param gt_bbox: bounding boxes normalized into [0, 1] + :param w: image width + :param h: image height + :return: bounding boxes in xyxy style 
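+        e.g. with w = h = 100, an xywh box [0.1, 0.2, 0.3, 0.4] becomes
+        [10., 20., 40., 60.] in absolute xyxy coordinates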
+ """ + bboxes = gt_bbox.copy() + bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w + bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h + bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] + bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] + return bboxes + + def _check_inside_boxes_limited(self, gt_bbox, xs, ys, + num_points_each_level): + """ + check if points is within the clipped boxes + :param gt_bbox: bounding boxes + :param xs: horizontal coordinate of points + :param ys: vertical coordinate of points + :return: the mask of points is within gt_box or not + """ + bboxes = np.reshape( + gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]]) + bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1]) + ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2 + ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2 + beg = 0 + clipped_box = bboxes.copy() + for lvl, stride in enumerate(self.downsample_ratios): + end = beg + num_points_each_level[lvl] + stride_exp = self.center_sampling_radius * stride + clipped_box[beg:end, :, 0] = np.maximum( + bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp) + clipped_box[beg:end, :, 1] = np.maximum( + bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp) + clipped_box[beg:end, :, 2] = np.minimum( + bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp) + clipped_box[beg:end, :, 3] = np.minimum( + bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp) + beg = end + l_res = xs - clipped_box[:, :, 0] + r_res = clipped_box[:, :, 2] - xs + t_res = ys - clipped_box[:, :, 1] + b_res = clipped_box[:, :, 3] - ys + clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) + inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0 + return inside_gt_box + + def __call__(self, samples, context=None): + assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \ + "object_sizes_of_interest', and 'downsample_ratios' should have same length." 
+ + for sample in samples: + # im, gt_bbox, gt_class, gt_score = sample + im = sample['image'] + bboxes = sample['gt_bbox'] + gt_class = sample['gt_class'] + # calculate the locations + h, w = im.shape[1:3] + points, num_points_each_level = self._compute_points(w, h) + object_scale_exp = [] + for i, num_pts in enumerate(num_points_each_level): + object_scale_exp.append( + np.tile( + np.array([self.object_sizes_of_interest[i]]), + reps=[num_pts, 1])) + object_scale_exp = np.concatenate(object_scale_exp, axis=0) + + gt_area = (bboxes[:, 2] - bboxes[:, 0]) * ( + bboxes[:, 3] - bboxes[:, 1]) + xs, ys = points[:, 0], points[:, 1] + xs = np.reshape(xs, newshape=[xs.shape[0], 1]) + xs = np.tile(xs, reps=[1, bboxes.shape[0]]) + ys = np.reshape(ys, newshape=[ys.shape[0], 1]) + ys = np.tile(ys, reps=[1, bboxes.shape[0]]) + + l_res = xs - bboxes[:, 0] + r_res = bboxes[:, 2] - xs + t_res = ys - bboxes[:, 1] + b_res = bboxes[:, 3] - ys + reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) + if self.center_sampling_radius > 0: + is_inside_box = self._check_inside_boxes_limited( + bboxes, xs, ys, num_points_each_level) + else: + is_inside_box = np.min(reg_targets, axis=2) > 0 + # check if the targets is inside the corresponding level + max_reg_targets = np.max(reg_targets, axis=2) + lower_bound = np.tile( + np.expand_dims( + object_scale_exp[:, 0], axis=1), + reps=[1, max_reg_targets.shape[1]]) + high_bound = np.tile( + np.expand_dims( + object_scale_exp[:, 1], axis=1), + reps=[1, max_reg_targets.shape[1]]) + is_match_current_level = \ + (max_reg_targets > lower_bound) & \ + (max_reg_targets < high_bound) + points2gtarea = np.tile( + np.expand_dims( + gt_area, axis=0), reps=[xs.shape[0], 1]) + points2gtarea[is_inside_box == 0] = self.INF + points2gtarea[is_match_current_level == 0] = self.INF + points2min_area = points2gtarea.min(axis=1) + points2min_area_ind = points2gtarea.argmin(axis=1) + labels = gt_class[points2min_area_ind] + 1 + labels[points2min_area == self.INF] = 0 + reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind] + ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \ + reg_targets[:, [0, 2]].max(axis=1)) * \ + (reg_targets[:, [1, 3]].min(axis=1) / \ + reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32) + ctn_targets = np.reshape( + ctn_targets, newshape=[ctn_targets.shape[0], 1]) + ctn_targets[labels <= 0] = 0 + pos_ind = np.nonzero(labels != 0) + reg_targets_pos = reg_targets[pos_ind[0], :] + split_sections = [] + beg = 0 + for lvl in range(len(num_points_each_level)): + end = beg + num_points_each_level[lvl] + split_sections.append(end) + beg = end + labels_by_level = np.split(labels, split_sections, axis=0) + reg_targets_by_level = np.split(reg_targets, split_sections, axis=0) + ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0) + for lvl in range(len(self.downsample_ratios)): + grid_w = int(np.ceil(w / self.downsample_ratios[lvl])) + grid_h = int(np.ceil(h / self.downsample_ratios[lvl])) + if self.norm_reg_targets: + sample['reg_target{}'.format(lvl)] = \ + np.reshape( + reg_targets_by_level[lvl] / \ + self.downsample_ratios[lvl], + newshape=[grid_h, grid_w, 4]) + else: + sample['reg_target{}'.format(lvl)] = np.reshape( + reg_targets_by_level[lvl], + newshape=[grid_h, grid_w, 4]) + sample['labels{}'.format(lvl)] = np.reshape( + labels_by_level[lvl], newshape=[grid_h, grid_w, 1]) + sample['centerness{}'.format(lvl)] = np.reshape( + ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1]) + + sample.pop('is_crowd') + 
sample.pop('gt_class') + sample.pop('gt_bbox') + return samples + + +@register_op +class Gt2TTFTarget(BaseOperator): + __shared__ = ['num_classes'] + """ + Gt2TTFTarget + Generate TTFNet targets by ground truth data + + Args: + num_classes(int): the number of classes. + down_ratio(int): the down ratio from images to heatmap, 4 by default. + alpha(float): the alpha parameter to generate gaussian target. + 0.54 by default. + """ + + def __init__(self, num_classes=80, down_ratio=4, alpha=0.54): + super(Gt2TTFTarget, self).__init__() + self.down_ratio = down_ratio + self.num_classes = num_classes + self.alpha = alpha + + def __call__(self, samples, context=None): + output_size = samples[0]['image'].shape[1] + feat_size = output_size // self.down_ratio + for sample in samples: + heatmap = np.zeros( + (self.num_classes, feat_size, feat_size), dtype='float32') + box_target = np.ones( + (4, feat_size, feat_size), dtype='float32') * -1 + reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32') + + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + + bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1 + bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1 + area = bbox_w * bbox_h + boxes_areas_log = np.log(area) + boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1] + boxes_area_topk_log = boxes_areas_log[boxes_ind] + gt_bbox = gt_bbox[boxes_ind] + gt_class = gt_class[boxes_ind] + + feat_gt_bbox = gt_bbox / self.down_ratio + feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1) + feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1], + feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0]) + + ct_inds = np.stack( + [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2, + (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2], + axis=1) / self.down_ratio + + h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32') + w_radiuses_alpha = (feat_ws / 2. 
* self.alpha).astype('int32') + + for k in range(len(gt_bbox)): + cls_id = gt_class[k] + fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32') + self.draw_truncate_gaussian(fake_heatmap, ct_inds[k], + h_radiuses_alpha[k], + w_radiuses_alpha[k]) + + heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap) + box_target_inds = fake_heatmap > 0 + box_target[:, box_target_inds] = gt_bbox[k][:, None] + + local_heatmap = fake_heatmap[box_target_inds] + ct_div = np.sum(local_heatmap) + local_heatmap *= boxes_area_topk_log[k] + reg_weight[0, box_target_inds] = local_heatmap / ct_div + sample['ttf_heatmap'] = heatmap + sample['ttf_box_target'] = box_target + sample['ttf_reg_weight'] = reg_weight + sample.pop('is_crowd') + sample.pop('gt_class') + sample.pop('gt_bbox') + if 'gt_score' in sample: + sample.pop('gt_score') + return samples + + def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius): + h, w = 2 * h_radius + 1, 2 * w_radius + 1 + sigma_x = w / 6 + sigma_y = h / 6 + gaussian = gaussian2D((h, w), sigma_x, sigma_y) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, w_radius), min(width - x, w_radius + 1) + top, bottom = min(y, h_radius), min(height - y, h_radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius - + left:w_radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: + heatmap[y - top:y + bottom, x - left:x + right] = np.maximum( + masked_heatmap, masked_gaussian) + return heatmap + + +@register_op +class Gt2Solov2Target(BaseOperator): + """Assign mask target and labels in SOLOv2 network. + Args: + num_grids (list): The list of feature map grids size. + scale_ranges (list): The list of mask boundary range. + coord_sigma (float): The coefficient of coordinate area length. + sampling_ratio (float): The ratio of down sampling. 
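+    For every grid level `i`, this op writes 'cate_label{i}', 'ins_label{i}'
+    and 'grid_order{i}' into each sample, plus a per-sample 'fg_num' count.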
+ """ + + def __init__(self, + num_grids=[40, 36, 24, 16, 12], + scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], + [384, 2048]], + coord_sigma=0.2, + sampling_ratio=4.0): + super(Gt2Solov2Target, self).__init__() + self.num_grids = num_grids + self.scale_ranges = scale_ranges + self.coord_sigma = coord_sigma + self.sampling_ratio = sampling_ratio + + def _scale_size(self, im, scale): + h, w = im.shape[:2] + new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) + resized_img = cv2.resize( + im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) + return resized_img + + def __call__(self, samples, context=None): + sample_id = 0 + max_ins_num = [0] * len(self.num_grids) + for sample in samples: + gt_bboxes_raw = sample['gt_bbox'] + gt_labels_raw = sample['gt_class'] + 1 + im_c, im_h, im_w = sample['image'].shape[:] + gt_masks_raw = sample['gt_segm'].astype(np.uint8) + mask_feat_size = [ + int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) + ] + gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * + (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) + ins_ind_label_list = [] + idx = 0 + for (lower_bound, upper_bound), num_grid \ + in zip(self.scale_ranges, self.num_grids): + + hit_indices = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero()[0] + num_ins = len(hit_indices) + + ins_label = [] + grid_order = [] + cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) + ins_ind_label = np.zeros([num_grid**2], dtype=np.bool) + + if num_ins == 0: + ins_label = np.zeros( + [1, mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray( + [sample_id * num_grid * num_grid + 0], dtype=np.int32) + idx += 1 + continue + gt_bboxes = gt_bboxes_raw[hit_indices] + gt_labels = gt_labels_raw[hit_indices] + gt_masks = gt_masks_raw[hit_indices, ...] + + half_ws = 0.5 * ( + gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma + half_hs = 0.5 * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma + + for seg_mask, gt_label, half_h, half_w in zip( + gt_masks, gt_labels, half_hs, half_ws): + if seg_mask.sum() == 0: + continue + # mass center + upsampled_size = (mask_feat_size[0] * 4, + mask_feat_size[1] * 4) + center_h, center_w = ndimage.measurements.center_of_mass( + seg_mask) + coord_w = int( + (center_w / upsampled_size[1]) // (1. / num_grid)) + coord_h = int( + (center_h / upsampled_size[0]) // (1. / num_grid)) + + # left, top, right, down + top_box = max(0, + int(((center_h - half_h) / upsampled_size[0]) + // (1. / num_grid))) + down_box = min(num_grid - 1, + int(((center_h + half_h) / upsampled_size[0]) + // (1. / num_grid))) + left_box = max(0, + int(((center_w - half_w) / upsampled_size[1]) + // (1. / num_grid))) + right_box = min(num_grid - 1, + int(((center_w + half_w) / + upsampled_size[1]) // (1. / num_grid))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + cate_label[top:(down + 1), left:(right + 1)] = gt_label + seg_mask = self._scale_size( + seg_mask, scale=1. 
/ self.sampling_ratio) + for i in range(top, down + 1): + for j in range(left, right + 1): + label = int(i * num_grid + j) + cur_ins_label = np.zeros( + [mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ + 1]] = seg_mask + ins_label.append(cur_ins_label) + ins_ind_label[label] = True + grid_order.append(sample_id * num_grid * num_grid + + label) + if ins_label == []: + ins_label = np.zeros( + [1, mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray( + [sample_id * num_grid * num_grid + 0], dtype=np.int32) + else: + ins_label = np.stack(ins_label, axis=0) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray( + grid_order, dtype=np.int32) + assert len(grid_order) > 0 + max_ins_num[idx] = max( + max_ins_num[idx], + sample['ins_label{}'.format(idx)].shape[0]) + idx += 1 + ins_ind_labels = np.concatenate([ + ins_ind_labels_level_img + for ins_ind_labels_level_img in ins_ind_label_list + ]) + fg_num = np.sum(ins_ind_labels) + sample['fg_num'] = fg_num + sample_id += 1 + + sample.pop('is_crowd') + sample.pop('gt_class') + sample.pop('gt_bbox') + sample.pop('gt_poly') + sample.pop('gt_segm') + + # padding batch + for data in samples: + for idx in range(len(self.num_grids)): + gt_ins_data = np.zeros( + [ + max_ins_num[idx], + data['ins_label{}'.format(idx)].shape[1], + data['ins_label{}'.format(idx)].shape[2] + ], + dtype=np.uint8) + gt_ins_data[0:data['ins_label{}'.format(idx)].shape[ + 0], :, :] = data['ins_label{}'.format(idx)] + gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32) + gt_grid_order[0:data['grid_order{}'.format(idx)].shape[ + 0]] = data['grid_order{}'.format(idx)] + data['ins_label{}'.format(idx)] = gt_ins_data + data['grid_order{}'.format(idx)] = gt_grid_order + + return samples + + +@register_op +class RboxPadBatch(BaseOperator): + """ + Pad a batch of samples so they can be divisible by a stride. + The layout of each image should be 'CHW'. And convert poly to rbox. + Args: + pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure + height and width is divisible by `pad_to_stride`. + """ + + def __init__(self, pad_to_stride=0, pad_gt=False): + super(RboxPadBatch, self).__init__() + self.pad_to_stride = pad_to_stride + self.pad_gt = pad_gt + + def __call__(self, samples, context=None): + """ + Args: + samples (list): a batch of sample, each is dict. 
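+        Returns:
+            samples (list): the padded batch, with 'gt_rbox' recomputed from
+                'gt_rbox2poly' via bbox_utils.poly_to_rbox.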
+ """ + coarsest_stride = self.pad_to_stride + + max_shape = np.array([data['image'].shape for data in samples]).max( + axis=0) + if coarsest_stride > 0: + max_shape[1] = int( + np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) + max_shape[2] = int( + np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) + + for data in samples: + im = data['image'] + im_c, im_h, im_w = im.shape[:] + padding_im = np.zeros( + (im_c, max_shape[1], max_shape[2]), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + data['image'] = padding_im + if 'semantic' in data and data['semantic'] is not None: + semantic = data['semantic'] + padding_sem = np.zeros( + (1, max_shape[1], max_shape[2]), dtype=np.float32) + padding_sem[:, :im_h, :im_w] = semantic + data['semantic'] = padding_sem + if 'gt_segm' in data and data['gt_segm'] is not None: + gt_segm = data['gt_segm'] + padding_segm = np.zeros( + (gt_segm.shape[0], max_shape[1], max_shape[2]), + dtype=np.uint8) + padding_segm[:, :im_h, :im_w] = gt_segm + data['gt_segm'] = padding_segm + if self.pad_gt: + gt_num = [] + if 'gt_poly' in data and data['gt_poly'] is not None and len(data[ + 'gt_poly']) > 0: + pad_mask = True + else: + pad_mask = False + + if pad_mask: + poly_num = [] + poly_part_num = [] + point_num = [] + for data in samples: + gt_num.append(data['gt_bbox'].shape[0]) + if pad_mask: + poly_num.append(len(data['gt_poly'])) + for poly in data['gt_poly']: + poly_part_num.append(int(len(poly))) + for p_p in poly: + point_num.append(int(len(p_p) / 2)) + gt_num_max = max(gt_num) + + for i, sample in enumerate(samples): + assert 'gt_rbox' in sample + assert 'gt_rbox2poly' in sample + gt_box_data = -np.ones([gt_num_max, 4], dtype=np.float32) + gt_class_data = -np.ones([gt_num_max], dtype=np.int32) + is_crowd_data = np.ones([gt_num_max], dtype=np.int32) + + if pad_mask: + poly_num_max = max(poly_num) + poly_part_num_max = max(poly_part_num) + point_num_max = max(point_num) + gt_masks_data = -np.ones( + [poly_num_max, poly_part_num_max, point_num_max, 2], + dtype=np.float32) + + gt_num = sample['gt_bbox'].shape[0] + gt_box_data[0:gt_num, :] = sample['gt_bbox'] + gt_class_data[0:gt_num] = np.squeeze(sample['gt_class']) + is_crowd_data[0:gt_num] = np.squeeze(sample['is_crowd']) + if pad_mask: + for j, poly in enumerate(sample['gt_poly']): + for k, p_p in enumerate(poly): + pp_np = np.array(p_p).reshape(-1, 2) + gt_masks_data[j, k, :pp_np.shape[0], :] = pp_np + sample['gt_poly'] = gt_masks_data + sample['gt_bbox'] = gt_box_data + sample['gt_class'] = gt_class_data + sample['is_crowd'] = is_crowd_data + # ploy to rbox + polys = sample['gt_rbox2poly'] + rbox = bbox_utils.poly_to_rbox(polys) + sample['gt_rbox'] = rbox + + return samples diff --git a/ppdet/data/transform/gridmask_utils.py b/ppdet/data/transform/gridmask_utils.py new file mode 100644 index 0000000..b0a27f0 --- /dev/null +++ b/ppdet/data/transform/gridmask_utils.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import numpy as np +from PIL import Image + + +class Gridmask(object): + def __init__(self, + use_h=True, + use_w=True, + rotate=1, + offset=False, + ratio=0.5, + mode=1, + prob=0.7, + upper_iter=360000): + super(Gridmask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.prob = prob + self.st_prob = prob + self.upper_iter = upper_iter + + def __call__(self, x, curr_iter): + self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) + if np.random.rand() > self.prob: + return x + h, w, _ = x.shape + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(2, h) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 + if self.use_w: + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + + w].astype(np.float32) + + if self.mode == 1: + mask = 1 - mask + mask = np.expand_dims(mask, axis=-1) + if self.offset: + offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) + x = (x * mask + offset * (1 - mask)).astype(x.dtype) + else: + x = (x * mask).astype(x.dtype) + + return x diff --git a/ppdet/data/transform/op_helper.py b/ppdet/data/transform/op_helper.py new file mode 100644 index 0000000..02d2195 --- /dev/null +++ b/ppdet/data/transform/op_helper.py @@ -0,0 +1,464 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# this file contains helper methods for BBOX processing + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import random +import math +import cv2 + + +def meet_emit_constraint(src_bbox, sample_bbox): + center_x = (src_bbox[2] + src_bbox[0]) / 2 + center_y = (src_bbox[3] + src_bbox[1]) / 2 + if center_x >= sample_bbox[0] and \ + center_x <= sample_bbox[2] and \ + center_y >= sample_bbox[1] and \ + center_y <= sample_bbox[3]: + return True + return False + + +def clip_bbox(src_bbox): + src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) + src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) + src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) + src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) + return src_bbox + + +def bbox_area(src_bbox): + if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: + return 0. 
+ else: + width = src_bbox[2] - src_bbox[0] + height = src_bbox[3] - src_bbox[1] + return width * height + + +def is_overlap(object_bbox, sample_bbox): + if object_bbox[0] >= sample_bbox[2] or \ + object_bbox[2] <= sample_bbox[0] or \ + object_bbox[1] >= sample_bbox[3] or \ + object_bbox[3] <= sample_bbox[1]: + return False + else: + return True + + +def filter_and_process(sample_bbox, bboxes, labels, scores=None, + keypoints=None): + new_bboxes = [] + new_labels = [] + new_scores = [] + new_keypoints = [] + new_kp_ignore = [] + for i in range(len(bboxes)): + new_bbox = [0, 0, 0, 0] + obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] + if not meet_emit_constraint(obj_bbox, sample_bbox): + continue + if not is_overlap(obj_bbox, sample_bbox): + continue + sample_width = sample_bbox[2] - sample_bbox[0] + sample_height = sample_bbox[3] - sample_bbox[1] + new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width + new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height + new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width + new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height + new_bbox = clip_bbox(new_bbox) + if bbox_area(new_bbox) > 0: + new_bboxes.append(new_bbox) + new_labels.append([labels[i][0]]) + if scores is not None: + new_scores.append([scores[i][0]]) + if keypoints is not None: + sample_keypoint = keypoints[0][i] + for j in range(len(sample_keypoint)): + kp_len = sample_height if j % 2 else sample_width + sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] + sample_keypoint[j] = ( + sample_keypoint[j] - sample_coord) / kp_len + sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) + new_keypoints.append(sample_keypoint) + new_kp_ignore.append(keypoints[1][i]) + + bboxes = np.array(new_bboxes) + labels = np.array(new_labels) + scores = np.array(new_scores) + if keypoints is not None: + keypoints = np.array(new_keypoints) + new_kp_ignore = np.array(new_kp_ignore) + return bboxes, labels, scores, (keypoints, new_kp_ignore) + return bboxes, labels, scores + + +def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): + new_bboxes = [] + new_labels = [] + new_scores = [] + for i, bbox in enumerate(bboxes): + w = float((bbox[2] - bbox[0]) * target_size) + h = float((bbox[3] - bbox[1]) * target_size) + if w * h < float(min_size * min_size): + continue + else: + new_bboxes.append(bbox) + new_labels.append(labels[i]) + if scores is not None and scores.size != 0: + new_scores.append(scores[i]) + bboxes = np.array(new_bboxes) + labels = np.array(new_labels) + scores = np.array(new_scores) + return bboxes, labels, scores + + +def generate_sample_bbox(sampler): + scale = np.random.uniform(sampler[2], sampler[3]) + aspect_ratio = np.random.uniform(sampler[4], sampler[5]) + aspect_ratio = max(aspect_ratio, (scale**2.0)) + aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) + bbox_width = scale * (aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = np.random.uniform(0, xmin_bound) + ymin = np.random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = [xmin, ymin, xmax, ymax] + return sampled_bbox + + +def generate_sample_bbox_square(sampler, image_width, image_height): + scale = np.random.uniform(sampler[2], sampler[3]) + aspect_ratio = np.random.uniform(sampler[4], sampler[5]) + aspect_ratio = max(aspect_ratio, (scale**2.0)) + aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) + bbox_width = scale * 
(aspect_ratio**0.5) + bbox_height = scale / (aspect_ratio**0.5) + if image_height < image_width: + bbox_width = bbox_height * image_height / image_width + else: + bbox_height = bbox_width * image_width / image_height + xmin_bound = 1 - bbox_width + ymin_bound = 1 - bbox_height + xmin = np.random.uniform(0, xmin_bound) + ymin = np.random.uniform(0, ymin_bound) + xmax = xmin + bbox_width + ymax = ymin + bbox_height + sampled_bbox = [xmin, ymin, xmax, ymax] + return sampled_bbox + + +def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, + resize_width): + num_gt = len(bbox_labels) + # np.random.randint range: [low, high) + rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 + + if num_gt != 0: + norm_xmin = bbox_labels[rand_idx][0] + norm_ymin = bbox_labels[rand_idx][1] + norm_xmax = bbox_labels[rand_idx][2] + norm_ymax = bbox_labels[rand_idx][3] + + xmin = norm_xmin * image_width + ymin = norm_ymin * image_height + wid = image_width * (norm_xmax - norm_xmin) + hei = image_height * (norm_ymax - norm_ymin) + range_size = 0 + + area = wid * hei + for scale_ind in range(0, len(scale_array) - 1): + if area > scale_array[scale_ind] ** 2 and area < \ + scale_array[scale_ind + 1] ** 2: + range_size = scale_ind + 1 + break + + if area > scale_array[len(scale_array) - 2]**2: + range_size = len(scale_array) - 2 + + scale_choose = 0.0 + if range_size == 0: + rand_idx_size = 0 + else: + # np.random.randint range: [low, high) + rng_rand_size = np.random.randint(0, range_size + 1) + rand_idx_size = rng_rand_size % (range_size + 1) + + if rand_idx_size == range_size: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = min(2.0 * scale_array[rand_idx_size], + 2 * math.sqrt(wid * hei)) + scale_choose = random.uniform(min_resize_val, max_resize_val) + else: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = 2.0 * scale_array[rand_idx_size] + scale_choose = random.uniform(min_resize_val, max_resize_val) + + sample_bbox_size = wid * resize_width / scale_choose + + w_off_orig = 0.0 + h_off_orig = 0.0 + if sample_bbox_size < max(image_height, image_width): + if wid <= sample_bbox_size: + w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, + xmin) + else: + w_off_orig = np.random.uniform(xmin, + xmin + wid - sample_bbox_size) + + if hei <= sample_bbox_size: + h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, + ymin) + else: + h_off_orig = np.random.uniform(ymin, + ymin + hei - sample_bbox_size) + + else: + w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) + h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) + + w_off_orig = math.floor(w_off_orig) + h_off_orig = math.floor(h_off_orig) + + # Figure out top left coordinates. 
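+        # (the offsets below are normalized back to fractions of the original
+        # image, so the sampled bbox stays in the same [0, 1] space as the gts)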
+ w_off = float(w_off_orig / image_width) + h_off = float(h_off_orig / image_height) + + sampled_bbox = [ + w_off, h_off, w_off + float(sample_bbox_size / image_width), + h_off + float(sample_bbox_size / image_height) + ] + return sampled_bbox + else: + return 0 + + +def jaccard_overlap(sample_bbox, object_bbox): + if sample_bbox[0] >= object_bbox[2] or \ + sample_bbox[2] <= object_bbox[0] or \ + sample_bbox[1] >= object_bbox[3] or \ + sample_bbox[3] <= object_bbox[1]: + return 0 + intersect_xmin = max(sample_bbox[0], object_bbox[0]) + intersect_ymin = max(sample_bbox[1], object_bbox[1]) + intersect_xmax = min(sample_bbox[2], object_bbox[2]) + intersect_ymax = min(sample_bbox[3], object_bbox[3]) + intersect_size = (intersect_xmax - intersect_xmin) * ( + intersect_ymax - intersect_ymin) + sample_bbox_size = bbox_area(sample_bbox) + object_bbox_size = bbox_area(object_bbox) + overlap = intersect_size / ( + sample_bbox_size + object_bbox_size - intersect_size) + return overlap + + +def intersect_bbox(bbox1, bbox2): + if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ + bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: + intersection_box = [0.0, 0.0, 0.0, 0.0] + else: + intersection_box = [ + max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), + min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) + ] + return intersection_box + + +def bbox_coverage(bbox1, bbox2): + inter_box = intersect_bbox(bbox1, bbox2) + intersect_size = bbox_area(inter_box) + + if intersect_size > 0: + bbox1_size = bbox_area(bbox1) + return intersect_size / bbox1_size + else: + return 0. + + +def satisfy_sample_constraint(sampler, + sample_bbox, + gt_bboxes, + satisfy_all=False): + if sampler[6] == 0 and sampler[7] == 0: + return True + satisfied = [] + for i in range(len(gt_bboxes)): + object_bbox = [ + gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] + ] + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler[6] != 0 and \ + overlap < sampler[6]: + satisfied.append(False) + continue + if sampler[7] != 0 and \ + overlap > sampler[7]: + satisfied.append(False) + continue + satisfied.append(True) + if not satisfy_all: + return True + + if satisfy_all: + return np.all(satisfied) + else: + return False + + +def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): + if sampler[6] == 0 and sampler[7] == 0: + has_jaccard_overlap = False + else: + has_jaccard_overlap = True + if sampler[8] == 0 and sampler[9] == 0: + has_object_coverage = False + else: + has_object_coverage = True + + if not has_jaccard_overlap and not has_object_coverage: + return True + found = False + for i in range(len(gt_bboxes)): + object_bbox = [ + gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] + ] + if has_jaccard_overlap: + overlap = jaccard_overlap(sample_bbox, object_bbox) + if sampler[6] != 0 and \ + overlap < sampler[6]: + continue + if sampler[7] != 0 and \ + overlap > sampler[7]: + continue + found = True + if has_object_coverage: + object_coverage = bbox_coverage(object_bbox, sample_bbox) + if sampler[8] != 0 and \ + object_coverage < sampler[8]: + continue + if sampler[9] != 0 and \ + object_coverage > sampler[9]: + continue + found = True + if found: + return True + return found + + +def crop_image_sampling(img, sample_bbox, image_width, image_height, + target_size): + # no clipping here + xmin = int(sample_bbox[0] * image_width) + xmax = int(sample_bbox[2] * image_width) + ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + + w_off = xmin + h_off = ymin + 
width = xmax - xmin + height = ymax - ymin + cross_xmin = max(0.0, float(w_off)) + cross_ymin = max(0.0, float(h_off)) + cross_xmax = min(float(w_off + width - 1.0), float(image_width)) + cross_ymax = min(float(h_off + height - 1.0), float(image_height)) + cross_width = cross_xmax - cross_xmin + cross_height = cross_ymax - cross_ymin + + roi_xmin = 0 if w_off >= 0 else abs(w_off) + roi_ymin = 0 if h_off >= 0 else abs(h_off) + roi_width = cross_width + roi_height = cross_height + + roi_y1 = int(roi_ymin) + roi_y2 = int(roi_ymin + roi_height) + roi_x1 = int(roi_xmin) + roi_x2 = int(roi_xmin + roi_width) + + cross_y1 = int(cross_ymin) + cross_y2 = int(cross_ymin + cross_height) + cross_x1 = int(cross_xmin) + cross_x2 = int(cross_xmin + cross_width) + + sample_img = np.zeros((height, width, 3)) + sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ + img[cross_y1: cross_y2, cross_x1: cross_x2] + + sample_img = cv2.resize( + sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) + + return sample_img + + +def is_poly(segm): + assert isinstance(segm, (list, dict)), \ + "Invalid segm type: {}".format(type(segm)) + return isinstance(segm, list) + + +def gaussian_radius(bbox_size, min_overlap): + height, width = bbox_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1**2 - 4 * a1 * c1) + radius1 = (b1 - sq1) / (2 * a1) + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2**2 - 4 * a2 * c2) + radius2 = (b2 - sq2) / (2 * a2) + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3**2 - 4 * a3 * c3) + radius3 = (b3 + sq3) / (2 * a3) + return min(radius1, radius2, radius3) + + +def draw_gaussian(heatmap, center, radius, k=1, delte=6): + diameter = 2 * radius + 1 + sigma = diameter / delte + gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) + + x, y = center + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: + radius + right] + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + + +def gaussian2D(shape, sigma_x=1, sigma_y=1): + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m + 1, -n:n + 1] + + h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * + sigma_y))) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py new file mode 100644 index 0000000..932c797 --- /dev/null +++ b/ppdet/data/transform/operators.py @@ -0,0 +1,2001 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
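+
+# Rough usage sketch (the file name here is only illustrative): each operator
+# is a callable over a sample dict, so a small pipeline can be chained by hand:
+#
+#   sample = {'im_file': 'demo.jpg'}
+#   for op in [Decode(), NormalizeImage(is_scale=True), Permute()]:
+#       sample = op(sample)
+#
+# In practice these operators are typically assembled from the dataset reader
+# configs rather than instantiated directly.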
+ +# function: +# operators to process sample, +# eg: decode/resize/crop image + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + +from numbers import Number, Integral + +import uuid +import random +import math +import numpy as np +import os +import copy + +import cv2 +from PIL import Image, ImageEnhance, ImageDraw + +from ppdet.core.workspace import serializable +from ppdet.modeling.layers import AnchorGrid +from ppdet.modeling import bbox_utils + +from .op_helper import (satisfy_sample_constraint, filter_and_process, + generate_sample_bbox, clip_bbox, data_anchor_sampling, + satisfy_sample_constraint_coverage, crop_image_sampling, + generate_sample_bbox_square, bbox_area_sampling, + is_poly, gaussian_radius, draw_gaussian) + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +registered_ops = [] + + +def register_op(cls): + registered_ops.append(cls.__name__) + if not hasattr(BaseOperator, cls.__name__): + setattr(BaseOperator, cls.__name__, cls) + else: + raise KeyError("The {} class has been registered.".format(cls.__name__)) + return serializable(cls) + + +class BboxError(ValueError): + pass + + +class ImageError(ValueError): + pass + + +class BaseOperator(object): + def __init__(self, name=None): + if name is None: + name = self.__class__.__name__ + self._id = name + '_' + str(uuid.uuid4())[-6:] + + def apply(self, sample, context=None): + """ Process a sample. + Args: + sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} + context (dict): info about this sample processing + Returns: + result (dict): a processed sample + """ + return sample + + def __call__(self, sample, context=None): + """ Process a sample. 
+ Args: + sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} + context (dict): info about this sample processing + Returns: + result (dict): a processed sample + """ + if isinstance(sample, Sequence): + for i in range(len(sample)): + sample[i] = self.apply(sample[i], context) + else: + sample = self.apply(sample, context) + return sample + + def __str__(self): + return str(self._id) + + +@register_op +class Decode(BaseOperator): + def __init__(self): + """ Transform the image data to numpy format following the rgb format + """ + super(Decode, self).__init__() + + def apply(self, sample, context=None): + """ load image if 'im_file' field is not empty but 'image' is""" + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + sample.pop('im_file') + + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + + sample['image'] = im + if 'h' not in sample: + sample['h'] = im.shape[0] + elif sample['h'] != im.shape[0]: + logger.warn( + "The actual image height: {} is not equal to the " + "height: {} in annotation, and update sample['h'] by actual " + "image height.".format(im.shape[0], sample['h'])) + sample['h'] = im.shape[0] + if 'w' not in sample: + sample['w'] = im.shape[1] + elif sample['w'] != im.shape[1]: + logger.warn( + "The actual image width: {} is not equal to the " + "width: {} in annotation, and update sample['w'] by actual " + "image width.".format(im.shape[1], sample['w'])) + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return sample + + +@register_op +class Permute(BaseOperator): + def __init__(self): + """ + Change the channel to be (C, H, W) + """ + super(Permute, self).__init__() + + def apply(self, sample, context=None): + im = sample['image'] + im = im.transpose((2, 0, 1)) + sample['image'] = im + return sample + + +@register_op +class Lighting(BaseOperator): + """ + Lighting the image by eigenvalues and eigenvectors + Args: + eigval (list): eigenvalues + eigvec (list): eigenvectors + alphastd (float): random weight of lighting, 0.1 by default + """ + + def __init__(self, eigval, eigvec, alphastd=0.1): + super(Lighting, self).__init__() + self.alphastd = alphastd + self.eigval = np.array(eigval).astype('float32') + self.eigvec = np.array(eigvec).astype('float32') + + def apply(self, sample, context=None): + alpha = np.random.normal(scale=self.alphastd, size=(3, )) + sample['image'] += np.dot(self.eigvec, self.eigval * alpha) + return sample + + +@register_op +class RandomErasingImage(BaseOperator): + def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): + """ + Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 + Args: + prob (float): probability to carry out random erasing + lower (float): lower limit of the erasing area ratio + heigher (float): upper limit of the erasing area ratio + aspect_ratio (float): aspect ratio of the erasing region + """ + super(RandomErasingImage, self).__init__() + self.prob = prob + self.lower = lower + self.heigher = heigher + self.aspect_ratio = aspect_ratio + + def apply(self, sample): + gt_bbox = sample['gt_bbox'] + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image is not a numpy array.".format(self)) + if len(im.shape) != 3: + raise ImageError("{}: image is not 
3-dimensional.".format(self)) + + for idx in range(gt_bbox.shape[0]): + if self.prob <= np.random.rand(): + continue + + x1, y1, x2, y2 = gt_bbox[idx, :] + w_bbox = x2 - x1 + h_bbox = y2 - y1 + area = w_bbox * h_bbox + + target_area = random.uniform(self.lower, self.higher) * area + aspect_ratio = random.uniform(self.aspect_ratio, + 1 / self.aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < w_bbox and h < h_bbox: + off_y1 = random.randint(0, int(h_bbox - h)) + off_x1 = random.randint(0, int(w_bbox - w)) + im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( + x1 + off_x1 + w), :] = 0 + sample['image'] = im + return sample + + +@register_op +class NormalizeImage(BaseOperator): + def __init__(self, mean=[0.485, 0.456, 0.406], std=[1, 1, 1], + is_scale=True): + """ + Args: + mean (list): the pixel mean + std (list): the pixel variance + """ + super(NormalizeImage, self).__init__() + self.mean = mean + self.std = std + self.is_scale = is_scale + if not (isinstance(self.mean, list) and isinstance(self.std, list) and + isinstance(self.is_scale, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def apply(self, sample, context=None): + """Normalize the image. + Operators: + 1.(optional) Scale the image to [0,1] + 2. Each pixel minus mean and is divided by std + """ + im = sample['image'] + im = im.astype(np.float32, copy=False) + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + + if self.is_scale: + im = im / 255.0 + + im -= mean + im /= std + + sample['image'] = im + return sample + + +@register_op +class GridMask(BaseOperator): + def __init__(self, + use_h=True, + use_w=True, + rotate=1, + offset=False, + ratio=0.5, + mode=1, + prob=0.7, + upper_iter=360000): + """ + GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086 + Args: + use_h (bool): whether to mask vertically + use_w (boo;): whether to mask horizontally + rotate (float): angle for the mask to rotate + offset (float): mask offset + ratio (float): mask ratio + mode (int): gridmask mode + prob (float): max probability to carry out gridmask + upper_iter (int): suggested to be equal to global max_iter + """ + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.prob = prob + self.upper_iter = upper_iter + + from .gridmask_utils import Gridmask + self.gridmask_op = Gridmask( + use_h, + use_w, + rotate=rotate, + offset=offset, + ratio=ratio, + mode=mode, + prob=prob, + upper_iter=upper_iter) + + def apply(self, sample, context=None): + sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter']) + return sample + + +@register_op +class RandomDistort(BaseOperator): + """Random color distortion. + Args: + hue (list): hue settings. in [lower, upper, probability] format. + saturation (list): saturation settings. in [lower, upper, probability] format. + contrast (list): contrast settings. in [lower, upper, probability] format. + brightness (list): brightness settings. in [lower, upper, probability] format. + random_apply (bool): whether to apply in random (yolo) or fixed (SSD) + order. 
+ count (int): the number of doing distrot + random_channel (bool): whether to swap channels randomly + """ + + def __init__(self, + hue=[-18, 18, 0.5], + saturation=[0.5, 1.5, 0.5], + contrast=[0.5, 1.5, 0.5], + brightness=[0.5, 1.5, 0.5], + random_apply=True, + count=4, + random_channel=False): + super(RandomDistort, self).__init__() + self.hue = hue + self.saturation = saturation + self.contrast = contrast + self.brightness = brightness + self.random_apply = random_apply + self.count = count + self.random_channel = random_channel + + def apply_hue(self, img): + low, high, prob = self.hue + if np.random.uniform(0., 1.) < prob: + return img + + img = img.astype(np.float32) + # it works, but result differ from HSV version + delta = np.random.uniform(low, high) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321], + [0.211, -0.523, 0.311]]) + ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647], + [1.0, -1.107, 1.705]]) + t = np.dot(np.dot(ityiq, bt), tyiq).T + img = np.dot(img, t) + return img + + def apply_saturation(self, img): + low, high, prob = self.saturation + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + # it works, but result differ from HSV version + gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + return img + + def apply_contrast(self, img): + low, high, prob = self.contrast + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + img *= delta + return img + + def apply_brightness(self, img): + low, high, prob = self.brightness + if np.random.uniform(0., 1.) 
< prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + img += delta + return img + + def apply(self, sample, context=None): + img = sample['image'] + if self.random_apply: + functions = [ + self.apply_brightness, self.apply_contrast, + self.apply_saturation, self.apply_hue + ] + distortions = np.random.permutation(functions)[:self.count] + for func in distortions: + img = func(img) + sample['image'] = img + return sample + + img = self.apply_brightness(img) + mode = np.random.randint(0, 2) + + if mode: + img = self.apply_contrast(img) + + img = self.apply_saturation(img) + img = self.apply_hue(img) + + if not mode: + img = self.apply_contrast(img) + + if self.random_channel: + if np.random.randint(0, 2): + img = img[..., np.random.permutation(3)] + sample['image'] = img + return sample + + +@register_op +class AutoAugment(BaseOperator): + def __init__(self, autoaug_type="v1"): + """ + Args: + autoaug_type (str): autoaug type, support v0, v1, v2, v3, test + """ + super(AutoAugment, self).__init__() + self.autoaug_type = autoaug_type + + def apply(self, sample, context=None): + """ + Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 + """ + im = sample['image'] + gt_bbox = sample['gt_bbox'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image is not a numpy array.".format(self)) + if len(im.shape) != 3: + raise ImageError("{}: image is not 3-dimensional.".format(self)) + if len(gt_bbox) == 0: + return sample + + height, width, _ = im.shape + norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) + norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) + norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) + norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) + norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) + + from .autoaugment_utils import distort_image_with_autoaugment + im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, + self.autoaug_type) + + gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) + gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) + gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width) + gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height) + + sample['image'] = im + sample['gt_bbox'] = gt_bbox + return sample + + +@register_op +class RandomFlip(BaseOperator): + def __init__(self, prob=0.5): + """ + Args: + prob (float): the probability of flipping image + """ + super(RandomFlip, self).__init__() + self.prob = prob + if not (isinstance(self.prob, float)): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply_segm(self, segms, height, width): + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + flipped_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + import pycocotools.mask as mask_util + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + def apply_keypoint(self, gt_keypoint, width): + for i in range(gt_keypoint.shape[1]): + if i % 2 == 0: + old_x = gt_keypoint[:, i].copy() + gt_keypoint[:, i] = width - old_x + return gt_keypoint + + def 
apply_image(self, image): + return image[:, ::-1, :] + + def apply_bbox(self, bbox, width): + oldx1 = bbox[:, 0].copy() + oldx2 = bbox[:, 2].copy() + bbox[:, 0] = width - oldx2 + bbox[:, 2] = width - oldx1 + return bbox + + def apply_rbox(self, bbox, width): + oldx1 = bbox[:, 0].copy() + oldx2 = bbox[:, 2].copy() + oldx3 = bbox[:, 4].copy() + oldx4 = bbox[:, 6].copy() + bbox[:, 0] = width - oldx1 + bbox[:, 2] = width - oldx2 + bbox[:, 4] = width - oldx3 + bbox[:, 6] = width - oldx4 + bbox = [bbox_utils.get_best_begin_point_single(e) for e in bbox] + return bbox + + def apply(self, sample, context=None): + """Filp the image and bounding box. + Operators: + 1. Flip the image numpy. + 2. Transform the bboxes' x coordinates. + (Must judge whether the coordinates are normalized!) + 3. Transform the segmentations' x coordinates. + (Must judge whether the coordinates are normalized!) + Output: + sample: the image, bounding box and segmentation part + in sample are flipped. + """ + if np.random.uniform(0, 1) < self.prob: + im = sample['image'] + height, width = im.shape[:2] + im = self.apply_image(im) + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, + width) + if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: + sample['gt_keypoint'] = self.apply_keypoint( + sample['gt_keypoint'], width) + + if 'semantic' in sample and sample['semantic']: + sample['semantic'] = sample['semantic'][:, ::-1] + + if 'gt_segm' in sample and sample['gt_segm'].any(): + sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] + + if 'gt_rbox2poly' in sample and sample['gt_rbox2poly'].any(): + sample['gt_rbox2poly'] = self.apply_rbox(sample['gt_rbox2poly'], + width) + + sample['flipped'] = True + sample['image'] = im + return sample + + +@register_op +class Resize(BaseOperator): + def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): + """ + Resize image to target size. if keep_ratio is True, + resize the image's long side to the maximum of target_size + if keep_ratio is False, resize the image to target size(h, w) + Args: + target_size (int|list): image target size + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): the interpolation method + """ + super(Resize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
+ format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + + def apply_image(self, image, scale): + im_scale_x, im_scale_y = scale + + return cv2.resize( + image, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + + def apply_bbox(self, bbox, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + bbox[:, 0::2] *= im_scale_x + bbox[:, 1::2] *= im_scale_y + bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) + bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) + return bbox + + def apply_segm(self, segms, im_size, scale): + def _resize_poly(poly, im_scale_x, im_scale_y): + resized_poly = np.array(poly) + resized_poly[0::2] *= im_scale_x + resized_poly[1::2] *= im_scale_y + return resized_poly.tolist() + + def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, im_h, im_w) + + mask = mask_util.decode(rle) + mask = cv2.resize( + image, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + im_h, im_w = im_size + im_scale_x, im_scale_y = scale + resized_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + resized_segms.append([ + _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm + ]) + else: + # RLE format + import pycocotools.mask as mask_util + resized_segms.append( + _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) + + return resized_segms + + def apply(self, sample, context=None): + """ Resize the image numpy. + """ + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + # apply image + im_shape = im.shape + if self.keep_ratio: + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = im_scale * float(im_shape[0]) + resize_w = im_scale * float(im_shape[1]) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply rbox + if 'gt_rbox2poly' in sample: + if np.array(sample['gt_rbox2poly']).shape[1] != 8: + logger.warn( + "gt_rbox2poly's length shoule be 8, but actually is {}". 
+ format(len(sample['gt_rbox2poly']))) + sample['gt_rbox2poly'] = self.apply_bbox(sample['gt_rbox2poly'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], + [im_scale_x, im_scale_y]) + + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + return sample + + +@register_op +class MultiscaleTestResize(BaseOperator): + def __init__(self, + origin_target_size=[800, 1333], + target_size=[], + interp=cv2.INTER_LINEAR, + use_flip=True): + """ + Rescale image to the each size in target size, and capped at max_size. + Args: + origin_target_size (list): origin target size of image + target_size (list): A list of target sizes of image. + interp (int): the interpolation method. + use_flip (bool): whether use flip augmentation. + """ + super(MultiscaleTestResize, self).__init__() + self.interp = interp + self.use_flip = use_flip + + if not isinstance(target_size, Sequence): + raise TypeError( + "Type of target_size is invalid. Must be List or Tuple, now is {}". + format(type(target_size))) + self.target_size = target_size + + if not isinstance(origin_target_size, Sequence): + raise TypeError( + "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". + format(type(origin_target_size))) + + self.origin_target_size = origin_target_size + + def apply(self, sample, context=None): + """ Resize the image numpy for multi-scale test. + """ + samples = [] + resizer = Resize( + self.origin_target_size, keep_ratio=True, interp=self.interp) + samples.append(resizer(sample.copy(), context)) + if self.use_flip: + flipper = RandomFlip(1.1) + samples.append(flipper(sample.copy(), context=context)) + + for size in self.target_size: + resizer = Resize(size, keep_ratio=True, interp=self.interp) + samples.append(resizer(sample.copy(), context)) + + return samples + + +@register_op +class RandomResize(BaseOperator): + def __init__(self, + target_size, + keep_ratio=True, + interp=cv2.INTER_LINEAR, + random_size=True, + random_interp=False): + """ + Resize image to target size randomly. 
random target_size and interpolation method + Args: + target_size (int, list, tuple): image target size, if random size is True, must be list or tuple + keep_ratio (bool): whether keep_raio or not, default true + interp (int): the interpolation method + random_size (bool): whether random select target size of image + random_interp (bool): whether random select interpolation method + """ + super(RandomResize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + assert isinstance(target_size, ( + Integral, Sequence)), "target_size must be Integer, List or Tuple" + if random_size and not isinstance(target_size, Sequence): + raise TypeError( + "Type of target_size is invalid when random_size is True. Must be List or Tuple, now is {}". + format(type(target_size))) + self.target_size = target_size + self.random_size = random_size + self.random_interp = random_interp + + def apply(self, sample, context=None): + """ Resize the image numpy. + """ + if self.random_size: + target_size = random.choice(self.target_size) + else: + target_size = self.target_size + + if self.random_interp: + interp = random.choice(self.interps) + else: + interp = self.interp + + resizer = Resize(target_size, self.keep_ratio, interp) + return resizer(sample, context=context) + + +@register_op +class RandomExpand(BaseOperator): + """Random expand the canvas. + Args: + ratio (float): maximum expansion ratio. + prob (float): probability to expand. + fill_value (list): color value used to fill the canvas. in RGB order. + """ + + def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): + super(RandomExpand, self).__init__() + assert ratio > 1.01, "expand ratio must be larger than 1.01" + self.ratio = ratio + self.prob = prob + assert isinstance(fill_value, (Number, Sequence)), \ + "fill value must be either float or sequence" + if isinstance(fill_value, Number): + fill_value = (fill_value, ) * 3 + if not isinstance(fill_value, tuple): + fill_value = tuple(fill_value) + self.fill_value = fill_value + + def apply(self, sample, context=None): + if np.random.uniform(0., 1.) < self.prob: + return sample + + im = sample['image'] + height, width = im.shape[:2] + ratio = np.random.uniform(1., self.ratio) + h = int(height * ratio) + w = int(width * ratio) + if not h > height or not w > width: + return sample + y = np.random.randint(0, h - height) + x = np.random.randint(0, w - width) + offsets, size = [x, y], [h, w] + + pad = Pad(size, + pad_mode=-1, + offsets=offsets, + fill_value=self.fill_value) + + return pad(sample, context=context) + + +@register_op +class CropWithSampling(BaseOperator): + def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True): + """ + Args: + batch_sampler (list): Multiple sets of different + parameters for cropping. + satisfy_all (bool): whether all boxes must satisfy. + e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap] + avoid_no_bbox (bool): whether to to avoid the + situation where the box does not appear. 
+ """ + super(CropWithSampling, self).__init__() + self.batch_sampler = batch_sampler + self.satisfy_all = satisfy_all + self.avoid_no_bbox = avoid_no_bbox + + def apply(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. + Returns: + sample: the image, bounding box are replaced. + """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + im_height, im_width = im.shape[:2] + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox(sampler) + if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, + self.satisfy_all): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + crop_bbox, crop_class, crop_score = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + xmin = int(sample_bbox[0] * im_width) + xmax = int(sample_bbox[2] * im_width) + ymin = int(sample_bbox[1] * im_height) + ymax = int(sample_bbox[3] * im_height) + im = im[ymin:ymax, xmin:xmax] + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + sample['gt_score'] = crop_score + return sample + return sample + + +@register_op +class CropWithDataAchorSampling(BaseOperator): + def __init__(self, + batch_sampler, + anchor_sampler=None, + target_size=None, + das_anchor_scales=[16, 32, 64, 128], + sampling_prob=0.5, + min_size=8., + avoid_no_bbox=True): + """ + Args: + anchor_sampler (list): anchor_sampling sets of different + parameters for cropping. + batch_sampler (list): Multiple sets of different + parameters for cropping. + e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] + [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap, min coverage, max coverage] + target_size (int): target image size. + das_anchor_scales (list[float]): a list of anchor scales in data + anchor smapling. + min_size (float): minimum size of sampled bbox. + avoid_no_bbox (bool): whether to to avoid the + situation where the box does not appear. + """ + super(CropWithDataAchorSampling, self).__init__() + self.anchor_sampler = anchor_sampler + self.batch_sampler = batch_sampler + self.target_size = target_size + self.sampling_prob = sampling_prob + self.min_size = min_size + self.avoid_no_bbox = avoid_no_bbox + self.das_anchor_scales = np.array(das_anchor_scales) + + def apply(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. 
+ Returns: + sample: the image, bounding box are replaced. + """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + image_height, image_width = im.shape[:2] + gt_bbox[:, 0] /= image_width + gt_bbox[:, 1] /= image_height + gt_bbox[:, 2] /= image_width + gt_bbox[:, 3] /= image_height + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + + prob = np.random.uniform(0., 1.) + if prob > self.sampling_prob: # anchor sampling + assert self.anchor_sampler + for sampler in self.anchor_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = data_anchor_sampling( + gt_bbox, image_width, image_height, + self.das_anchor_scales, self.target_size) + if sample_bbox == 0: + break + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + im = crop_image_sampling(im, sample_bbox, image_width, + image_height, self.target_size) + height, width = im.shape[:2] + crop_bbox[:, 0] *= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + else: + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox_square( + sampler, image_width, image_height) + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + # sampling bbox according the bbox area + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + xmin = int(sample_bbox[0] * image_width) + xmax = int(sample_bbox[2] * image_width) + ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + im = im[ymin:ymax, 
xmin:xmax] + height, width = im.shape[:2] + crop_bbox[:, 0] *= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + +@register_op +class RandomCrop(BaseOperator): + """Random crop image and bboxes. + Args: + aspect_ratio (list): aspect ratio of cropped region. + in [min, max] format. + thresholds (list): iou thresholds for decide a valid bbox crop. + scaling (list): ratio between a cropped region and the original image. + in [min, max] format. + num_attempts (int): number of tries before giving up. + allow_no_crop (bool): allow return without actually cropping them. + cover_all_box (bool): ensure all bboxes are covered in the final crop. + is_mask_crop(bool): whether crop the segmentation. + """ + + def __init__(self, + aspect_ratio=[.5, 2.], + thresholds=[.0, .1, .3, .5, .7, .9], + scaling=[.3, 1.], + num_attempts=50, + allow_no_crop=True, + cover_all_box=False, + is_mask_crop=False): + super(RandomCrop, self).__init__() + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + self.is_mask_crop = is_mask_crop + + def crop_segms(self, segms, valid_ids, crop, height, width): + def _crop_poly(segm, crop): + xmin, ymin, xmax, ymax = crop + crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] + crop_p = np.array(crop_coord).reshape(4, 2) + crop_p = Polygon(crop_p) + + crop_segm = list() + for poly in segm: + poly = np.array(poly).reshape(len(poly) // 2, 2) + polygon = Polygon(poly) + if not polygon.is_valid: + exterior = polygon.exterior + multi_lines = exterior.intersection(exterior) + polygons = shapely.ops.polygonize(multi_lines) + polygon = MultiPolygon(polygons) + multi_polygon = list() + if isinstance(polygon, MultiPolygon): + multi_polygon = copy.deepcopy(polygon) + else: + multi_polygon.append(copy.deepcopy(polygon)) + for per_polygon in multi_polygon: + inter = per_polygon.intersection(crop_p) + if not inter: + continue + if isinstance(inter, (MultiPolygon, GeometryCollection)): + for part in inter: + if not isinstance(part, Polygon): + continue + part = np.squeeze( + np.array(part.exterior.coords[:-1]).reshape(1, + -1)) + part[0::2] -= xmin + part[1::2] -= ymin + crop_segm.append(part.tolist()) + elif isinstance(inter, Polygon): + crop_poly = np.squeeze( + np.array(inter.exterior.coords[:-1]).reshape(1, -1)) + crop_poly[0::2] -= xmin + crop_poly[1::2] -= ymin + crop_segm.append(crop_poly.tolist()) + else: + continue + return crop_segm + + def _crop_rle(rle, crop, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[crop[1]:crop[3], crop[0]:crop[2]] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + crop_segms = [] + for id in valid_ids: + segm = segms[id] + if is_poly(segm): + import copy + import shapely.ops + from shapely.geometry import Polygon, MultiPolygon, GeometryCollection + logging.getLogger("shapely").setLevel(logging.WARNING) + # Polygon format + crop_segms.append(_crop_poly(segm, crop)) + else: + # RLE format + import pycocotools.mask as 
mask_util + crop_segms.append(_crop_rle(segm, crop, height, width)) + return crop_segms + + def apply(self, sample, context=None): + if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: + return sample + + h, w = sample['image'].shape[:2] + gt_bbox = sample['gt_bbox'] + + # NOTE Original method attempts to generate one candidate for each + # threshold then randomly sample one from the resulting list. + # Here a short circuit approach is taken, i.e., randomly choose a + # threshold and attempt to find a valid crop, and simply return the + # first one found. + # The probability is not exactly the same, kinda resembling the + # "Monty Hall" problem. Actually carrying out the attempts will affect + # observability (just like opening doors in the "Monty Hall" game). + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + np.random.shuffle(thresholds) + + for thresh in thresholds: + if thresh == 'no_crop': + return sample + + found = False + for i in range(self.num_attempts): + scale = np.random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = np.random.uniform(*self.scaling) + w_scale = np.random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + crop_h = int(crop_h) + crop_w = int(crop_w) + crop_y = np.random.randint(0, h - crop_h) + crop_x = np.random.randint(0, w - crop_w) + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + if self.is_mask_crop and 'gt_poly' in sample and len(sample[ + 'gt_poly']) > 0: + crop_polys = self.crop_segms( + sample['gt_poly'], + valid_ids, + np.array( + crop_box, dtype=np.int64), + h, + w) + if [] in crop_polys: + delete_id = list() + valid_polys = list() + for id, crop_poly in enumerate(crop_polys): + if crop_poly == []: + delete_id.append(id) + else: + valid_polys.append(crop_poly) + valid_ids = np.delete(valid_ids, delete_id) + if len(valid_polys) == 0: + return sample + sample['gt_poly'] = valid_polys + else: + sample['gt_poly'] = crop_polys + + if 'gt_segm' in sample: + sample['gt_segm'] = self._crop_segm(sample['gt_segm'], + crop_box) + sample['gt_segm'] = np.take( + sample['gt_segm'], valid_ids, axis=0) + + sample['image'] = self._crop_image(sample['image'], crop_box) + sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) + sample['gt_class'] = np.take( + sample['gt_class'], valid_ids, axis=0) + if 'gt_score' in sample: + sample['gt_score'] = np.take( + sample['gt_score'], valid_ids, axis=0) + + if 'is_crowd' in sample: + sample['is_crowd'] = np.take( + sample['is_crowd'], valid_ids, axis=0) + return sample + + return sample + + def _iou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = 
np.prod(b[:, 2:] - b[:, :2], axis=1)
+        area_o = (area_a[:, np.newaxis] + area_b - area_i)
+        return area_i / (area_o + 1e-10)
+
+    def _crop_box_with_center_constraint(self, box, crop):
+        cropped_box = box.copy()
+
+        cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
+        cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
+        cropped_box[:, :2] -= crop[:2]
+        cropped_box[:, 2:] -= crop[:2]
+
+        centers = (box[:, :2] + box[:, 2:]) / 2
+        valid = np.logical_and(crop[:2] <= centers,
+                               centers < crop[2:]).all(axis=1)
+        valid = np.logical_and(
+            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
+
+        return cropped_box, np.where(valid)[0]
+
+    def _crop_image(self, img, crop):
+        x1, y1, x2, y2 = crop
+        return img[y1:y2, x1:x2, :]
+
+    def _crop_segm(self, segm, crop):
+        x1, y1, x2, y2 = crop
+        return segm[:, y1:y2, x1:x2]
+
+
+@register_op
+class RandomScaledCrop(BaseOperator):
+    """Resize image and bbox based on long side (with optional random scaling),
+    then crop or pad image to target size.
+    Args:
+        target_dim (int): target size.
+        scale_range (list): random scale range.
+        interp (int): interpolation method, default to `cv2.INTER_LINEAR`.
+    """
+
+    def __init__(self,
+                 target_dim=512,
+                 scale_range=[.1, 2.],
+                 interp=cv2.INTER_LINEAR):
+        super(RandomScaledCrop, self).__init__()
+        self.target_dim = target_dim
+        self.scale_range = scale_range
+        self.interp = interp
+
+    def apply(self, sample, context=None):
+        img = sample['image']
+        h, w = img.shape[:2]
+        random_scale = np.random.uniform(*self.scale_range)
+        dim = self.target_dim
+        random_dim = int(dim * random_scale)
+        dim_max = max(h, w)
+        scale = random_dim / dim_max
+        resize_w = int(round(w * scale))
+        resize_h = int(round(h * scale))
+        offset_x = int(max(0, np.random.uniform(0., resize_w - dim)))
+        offset_y = int(max(0, np.random.uniform(0., resize_h - dim)))
+
+        img = cv2.resize(img, (resize_w, resize_h), interpolation=self.interp)
+        img = np.array(img)
+        canvas = np.zeros((dim, dim, 3), dtype=img.dtype)
+        canvas[:min(dim, resize_h), :min(dim, resize_w), :] = img[
+            offset_y:offset_y + dim, offset_x:offset_x + dim, :]
+        sample['image'] = canvas
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        scale_factor = sample['scale_factor']
+        sample['scale_factor'] = np.asarray(
+            [scale_factor[0] * scale, scale_factor[1] * scale],
+            dtype=np.float32)
+
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            scale_array = np.array([scale, scale] * 2, dtype=np.float32)
+            shift_array = np.array([offset_x, offset_y] * 2, dtype=np.float32)
+            boxes = sample['gt_bbox'] * scale_array - shift_array
+            boxes = np.clip(boxes, 0, dim - 1)
+            # filter boxes with no area
+            area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)
+            valid = (area > 1.).nonzero()[0]
+            sample['gt_bbox'] = boxes[valid]
+            sample['gt_class'] = sample['gt_class'][valid]
+
+        return sample
+
+
+@register_op
+class Cutmix(BaseOperator):
+    def __init__(self, alpha=1.5, beta=1.5):
+        """
+        CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899
+        CutMix the image and gt_bbox/gt_score
+        Args:
+            alpha (float): alpha parameter of the Beta distribution
+            beta (float): beta parameter of the Beta distribution
+        """
+        super(Cutmix, self).__init__()
+        self.alpha = alpha
+        self.beta = beta
+        if self.alpha <= 0.0:
+            raise ValueError("alpha should be positive in {}".format(self))
+        if self.beta <= 0.0:
+            raise ValueError("beta should be positive in {}".format(self))
+
+    def apply_image(self, img1, img2, factor):
+        """ cut a random bbox region from img2 and paste it onto img1 """
+        h =
max(img1.shape[0], img2.shape[0]) + w = max(img1.shape[1], img2.shape[1]) + cut_rat = np.sqrt(1. - factor) + + cut_w = np.int(w * cut_rat) + cut_h = np.int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w - 1) + bby1 = np.clip(cy - cut_h // 2, 0, h - 1) + bbx2 = np.clip(cx + cut_w // 2, 0, w - 1) + bby2 = np.clip(cy + cut_h // 2, 0, h - 1) + + img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32') + img_1_pad[:img1.shape[0], :img1.shape[1], :] = \ + img1.astype('float32') + img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32') + img_2_pad[:img2.shape[0], :img2.shape[1], :] = \ + img2.astype('float32') + img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :] + return img_1_pad + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len(sample) == 2, 'cutmix need two samples' + + factor = np.random.beta(self.alpha, self.beta) + factor = max(0.0, min(1.0, factor)) + if factor >= 1.0: + return sample[0] + if factor <= 0.0: + return sample[1] + img1 = sample[0]['image'] + img2 = sample[1]['image'] + img = self.apply_image(img1, img2, factor) + gt_bbox1 = sample[0]['gt_bbox'] + gt_bbox2 = sample[1]['gt_bbox'] + gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) + gt_class1 = sample[0]['gt_class'] + gt_class2 = sample[1]['gt_class'] + gt_class = np.concatenate((gt_class1, gt_class2), axis=0) + gt_score1 = np.ones_like(sample[0]['gt_class']) + gt_score2 = np.ones_like(sample[1]['gt_class']) + gt_score = np.concatenate( + (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0) + result = copy.deepcopy(sample[0]) + result['image'] = img + result['gt_bbox'] = gt_bbox + result['gt_score'] = gt_score + result['gt_class'] = gt_class + if 'is_crowd' in sample[0]: + is_crowd1 = sample[0]['is_crowd'] + is_crowd2 = sample[1]['is_crowd'] + is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) + result['is_crowd'] = is_crowd + if 'difficult' in sample[0]: + is_difficult1 = sample[0]['difficult'] + is_difficult2 = sample[1]['difficult'] + is_difficult = np.concatenate( + (is_difficult1, is_difficult2), axis=0) + result['difficult'] = is_difficult + return result + + +@register_op +class Mixup(BaseOperator): + def __init__(self, alpha=1.5, beta=1.5): + """ Mixup image and gt_bbbox/gt_score + Args: + alpha (float): alpha parameter of beta distribute + beta (float): beta parameter of beta distribute + """ + super(Mixup, self).__init__() + self.alpha = alpha + self.beta = beta + if self.alpha <= 0.0: + raise ValueError("alpha shold be positive in {}".format(self)) + if self.beta <= 0.0: + raise ValueError("beta shold be positive in {}".format(self)) + + def apply_image(self, img1, img2, factor): + h = max(img1.shape[0], img2.shape[0]) + w = max(img1.shape[1], img2.shape[1]) + img = np.zeros((h, w, img1.shape[2]), 'float32') + img[:img1.shape[0], :img1.shape[1], :] = \ + img1.astype('float32') * factor + img[:img2.shape[0], :img2.shape[1], :] += \ + img2.astype('float32') * (1.0 - factor) + return img.astype('uint8') + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len(sample) == 2, 'mixup need two samples' + + factor = np.random.beta(self.alpha, self.beta) + factor = max(0.0, min(1.0, factor)) + if factor >= 1.0: + return sample[0] + if factor <= 0.0: + return sample[1] + im = self.apply_image(sample[0]['image'], sample[1]['image'], factor) + result = copy.deepcopy(sample[0]) + result['image'] = 
im + # apply bbox and score + if 'gt_bbox' in sample[0]: + gt_bbox1 = sample[0]['gt_bbox'] + gt_bbox2 = sample[1]['gt_bbox'] + gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) + result['gt_bbox'] = gt_bbox + if 'gt_class' in sample[0]: + gt_class1 = sample[0]['gt_class'] + gt_class2 = sample[1]['gt_class'] + gt_class = np.concatenate((gt_class1, gt_class2), axis=0) + result['gt_class'] = gt_class + + gt_score1 = np.ones_like(sample[0]['gt_class']) + gt_score2 = np.ones_like(sample[1]['gt_class']) + gt_score = np.concatenate( + (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0) + result['gt_score'] = gt_score + if 'is_crowd' in sample[0]: + is_crowd1 = sample[0]['is_crowd'] + is_crowd2 = sample[1]['is_crowd'] + is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) + result['is_crowd'] = is_crowd + if 'difficult' in sample[0]: + is_difficult1 = sample[0]['difficult'] + is_difficult2 = sample[1]['difficult'] + is_difficult = np.concatenate( + (is_difficult1, is_difficult2), axis=0) + result['difficult'] = is_difficult + + return result + + +@register_op +class NormalizeBox(BaseOperator): + """Transform the bounding box's coornidates to [0,1].""" + + def __init__(self): + super(NormalizeBox, self).__init__() + + def apply(self, sample, context): + im = sample['image'] + gt_bbox = sample['gt_bbox'] + height, width, _ = im.shape + for i in range(gt_bbox.shape[0]): + gt_bbox[i][0] = gt_bbox[i][0] / width + gt_bbox[i][1] = gt_bbox[i][1] / height + gt_bbox[i][2] = gt_bbox[i][2] / width + gt_bbox[i][3] = gt_bbox[i][3] / height + sample['gt_bbox'] = gt_bbox + + if 'gt_keypoint' in sample.keys(): + gt_keypoint = sample['gt_keypoint'] + + for i in range(gt_keypoint.shape[1]): + if i % 2: + gt_keypoint[:, i] = gt_keypoint[:, i] / height + else: + gt_keypoint[:, i] = gt_keypoint[:, i] / width + sample['gt_keypoint'] = gt_keypoint + + return sample + + +@register_op +class BboxXYXY2XYWH(BaseOperator): + """ + Convert bbox XYXY format to XYWH format. + """ + + def __init__(self): + super(BboxXYXY2XYWH, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2] + bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2. + sample['gt_bbox'] = bbox + return sample + + +@register_op +class PadBox(BaseOperator): + def __init__(self, num_max_boxes=50): + """ + Pad zeros to bboxes if number of bboxes is less than num_max_boxes. 
+ Args: + num_max_boxes (int): the max number of bboxes + """ + self.num_max_boxes = num_max_boxes + super(PadBox, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_bbox' in sample + bbox = sample['gt_bbox'] + gt_num = min(self.num_max_boxes, len(bbox)) + num_max = self.num_max_boxes + # fields = context['fields'] if context else [] + pad_bbox = np.zeros((num_max, 4), dtype=np.float32) + if gt_num > 0: + pad_bbox[:gt_num, :] = bbox[:gt_num, :] + sample['gt_bbox'] = pad_bbox + if 'gt_class' in sample: + pad_class = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_class[:gt_num] = sample['gt_class'][:gt_num, 0] + sample['gt_class'] = pad_class + if 'gt_score' in sample: + pad_score = np.zeros((num_max, ), dtype=np.float32) + if gt_num > 0: + pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] + sample['gt_score'] = pad_score + # in training, for example in op ExpandImage, + # the bbox and gt_class is expandded, but the difficult is not, + # so, judging by it's length + if 'difficult' in sample: + pad_diff = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] + sample['difficult'] = pad_diff + if 'is_crowd' in sample: + pad_crowd = np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] + sample['is_crowd'] = pad_crowd + return sample + + +@register_op +class DebugVisibleImage(BaseOperator): + """ + In debug mode, visualize images according to `gt_box`. + (Currently only supported when not cropping and flipping image.) + """ + + def __init__(self, output_dir='output/debug', is_normalized=False): + super(DebugVisibleImage, self).__init__() + self.is_normalized = is_normalized + self.output_dir = output_dir + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + if not isinstance(self.is_normalized, bool): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply(self, sample, context=None): + image = Image.open(sample['im_file']).convert('RGB') + out_file_name = sample['im_file'].split('/')[-1] + width = sample['w'] + height = sample['h'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + draw = ImageDraw.Draw(image) + for i in range(gt_bbox.shape[0]): + if self.is_normalized: + gt_bbox[i][0] = gt_bbox[i][0] * width + gt_bbox[i][1] = gt_bbox[i][1] * height + gt_bbox[i][2] = gt_bbox[i][2] * width + gt_bbox[i][3] = gt_bbox[i][3] * height + + xmin, ymin, xmax, ymax = gt_bbox[i] + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill='green') + # draw label + text = str(gt_class[i][0]) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + if 'gt_keypoint' in sample.keys(): + gt_keypoint = sample['gt_keypoint'] + if self.is_normalized: + for i in range(gt_keypoint.shape[1]): + if i % 2: + gt_keypoint[:, i] = gt_keypoint[:, i] * height + else: + gt_keypoint[:, i] = gt_keypoint[:, i] * width + for i in range(gt_keypoint.shape[0]): + keypoint = gt_keypoint[i] + for j in range(int(keypoint.shape[0] / 2)): + x1 = round(keypoint[2 * j]).astype(np.int32) + y1 = round(keypoint[2 * j + 1]).astype(np.int32) + draw.ellipse( + (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') + save_path = os.path.join(self.output_dir, out_file_name) + image.save(save_path, quality=95) + return sample + + +@register_op +class Pad(BaseOperator): + def __init__(self, + 
size=None,
+                 size_divisor=32,
+                 pad_mode=0,
+                 offsets=None,
+                 fill_value=(127.5, 127.5, 127.5)):
+        """
+        Pad image to a specified size or multiple of size_divisor.
+        Args:
+            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
+            size_divisor (int): size divisor, default 32
+            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
+                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
+            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
+            fill_value (list): rgb value of pad area, default (127.5, 127.5, 127.5)
+        """
+        super(Pad, self).__init__()
+
+        if size is not None and not isinstance(size, (int, Sequence)):
+            raise TypeError(
+                "Type of size is invalid. Must be Integer or List or Tuple, "
+                "now is {}".format(type(size)))
+
+        if isinstance(size, int):
+            size = [size, size]
+
+        assert pad_mode in [
+            -1, 0, 1, 2
+        ], 'currently only supports four modes [-1, 0, 1, 2]'
+        if pad_mode == -1:
+            assert offsets, 'if pad_mode is -1, offsets should not be None'
+
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_mode = pad_mode
+        self.fill_value = fill_value
+        self.offsets = offsets
+
+    def apply_segm(self, segms, offsets, im_size, size):
+        def _expand_poly(poly, x, y):
+            expanded_poly = np.array(poly)
+            expanded_poly[0::2] += x
+            expanded_poly[1::2] += y
+            return expanded_poly.tolist()
+
+        def _expand_rle(rle, x, y, height, width, h, w):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            expanded_mask = np.full((h, w), 0).astype(mask.dtype)
+            expanded_mask[y:y + height, x:x + width] = mask
+            rle = mask_util.encode(
+                np.array(
+                    expanded_mask, order='F', dtype=np.uint8))
+            return rle
+
+        x, y = offsets
+        height, width = im_size
+        h, w = size
+        expanded_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                expanded_segms.append(
+                    [_expand_poly(poly, x, y) for poly in segm])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                expanded_segms.append(
+                    _expand_rle(segm, x, y, height, width, h, w))
+        return expanded_segms
+
+    def apply_bbox(self, bbox, offsets):
+        return bbox + np.array(offsets * 2, dtype=np.float32)
+
+    def apply_keypoint(self, keypoints, offsets):
+        n = len(keypoints[0]) // 2
+        return keypoints + np.array(offsets * n, dtype=np.float32)
+
+    def apply_image(self, image, offsets, im_size, size):
+        x, y = offsets
+        im_h, im_w = im_size
+        h, w = size
+        canvas = np.ones((h, w, 3), dtype=np.float32)
+        canvas *= np.array(self.fill_value, dtype=np.float32)
+        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
+        return canvas
+
+    def apply(self, sample, context=None):
+        im = sample['image']
+        im_h, im_w = im.shape[:2]
+        if self.size:
+            h, w = self.size
+            assert (
+                im_h <= h and im_w <= w
+            ), '(h, w) of target size should not be less than (im_h, im_w)'
+        else:
+            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
+            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
+
+        if h == im_h and w == im_w:
+            return sample
+
+        if self.pad_mode == -1:
+            offset_x, offset_y = self.offsets
+        elif self.pad_mode == 0:
+            offset_y, offset_x = 0, 0
+        elif self.pad_mode == 1:
+            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
+        else:
+            offset_y, offset_x = h - im_h, w - im_w
+
+        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]
+
+        sample['image'] = self.apply_image(im,
offsets, im_size, size) + + if self.pad_mode == 0: + return sample + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) + + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, + im_size, size) + + if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: + sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], + offsets) + + return sample + + +@register_op +class Poly2Mask(BaseOperator): + """ + gt poly to mask annotations + """ + + def __init__(self): + super(Poly2Mask, self).__init__() + import pycocotools.mask as maskUtils + self.maskutils = maskUtils + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + rle = self.maskutils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = self.maskutils.decode(rle) + return mask + + def apply(self, sample, context=None): + assert 'gt_poly' in sample + im_h = sample['h'] + im_w = sample['w'] + masks = [ + self._poly2mask(gt_poly, im_h, im_w) + for gt_poly in sample['gt_poly'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + return sample + + +@register_op +class Rbox2Poly(BaseOperator): + """ + Convert rbbox format to poly format. + """ + + def __init__(self): + super(Rbox2Poly, self).__init__() + + def apply(self, sample, context=None): + assert 'gt_rbox' in sample + assert sample['gt_rbox'].shape[1] == 5 + rrects = sample['gt_rbox'] + x_ctr = rrects[:, 0] + y_ctr = rrects[:, 1] + width = rrects[:, 2] + height = rrects[:, 3] + x1 = x_ctr - width / 2.0 + y1 = y_ctr - height / 2.0 + x2 = x_ctr + width / 2.0 + y2 = y_ctr + height / 2.0 + sample['gt_bbox'] = np.stack([x1, y1, x2, y2], axis=1) + polys = bbox_utils.rbox2poly(rrects) + sample['gt_rbox2poly'] = polys + return sample diff --git a/ppdet/engine/__init__.py b/ppdet/engine/__init__.py new file mode 100644 index 0000000..dfded9e --- /dev/null +++ b/ppdet/engine/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import trainer +from .trainer import * + +from . import callbacks +from .callbacks import * + +from . 
import env +from .env import * + +__all__ = trainer.__all__ \ + + callbacks.__all__ \ + + env.__all__ diff --git a/ppdet/engine/__pycache__/__init__.cpython-38.pyc b/ppdet/engine/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..ed1e47c Binary files /dev/null and b/ppdet/engine/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/engine/__pycache__/__init__.cpython-39.pyc b/ppdet/engine/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c537d75 Binary files /dev/null and b/ppdet/engine/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/engine/__pycache__/callbacks.cpython-38.pyc b/ppdet/engine/__pycache__/callbacks.cpython-38.pyc new file mode 100644 index 0000000..df62806 Binary files /dev/null and b/ppdet/engine/__pycache__/callbacks.cpython-38.pyc differ diff --git a/ppdet/engine/__pycache__/callbacks.cpython-39.pyc b/ppdet/engine/__pycache__/callbacks.cpython-39.pyc new file mode 100644 index 0000000..9ae89a9 Binary files /dev/null and b/ppdet/engine/__pycache__/callbacks.cpython-39.pyc differ diff --git a/ppdet/engine/__pycache__/env.cpython-38.pyc b/ppdet/engine/__pycache__/env.cpython-38.pyc new file mode 100644 index 0000000..0f92a66 Binary files /dev/null and b/ppdet/engine/__pycache__/env.cpython-38.pyc differ diff --git a/ppdet/engine/__pycache__/env.cpython-39.pyc b/ppdet/engine/__pycache__/env.cpython-39.pyc new file mode 100644 index 0000000..117bef0 Binary files /dev/null and b/ppdet/engine/__pycache__/env.cpython-39.pyc differ diff --git a/ppdet/engine/__pycache__/export_utils.cpython-38.pyc b/ppdet/engine/__pycache__/export_utils.cpython-38.pyc new file mode 100644 index 0000000..7d97e7b Binary files /dev/null and b/ppdet/engine/__pycache__/export_utils.cpython-38.pyc differ diff --git a/ppdet/engine/__pycache__/export_utils.cpython-39.pyc b/ppdet/engine/__pycache__/export_utils.cpython-39.pyc new file mode 100644 index 0000000..44f34d2 Binary files /dev/null and b/ppdet/engine/__pycache__/export_utils.cpython-39.pyc differ diff --git a/ppdet/engine/__pycache__/trainer.cpython-38.pyc b/ppdet/engine/__pycache__/trainer.cpython-38.pyc new file mode 100644 index 0000000..7a9354b Binary files /dev/null and b/ppdet/engine/__pycache__/trainer.cpython-38.pyc differ diff --git a/ppdet/engine/__pycache__/trainer.cpython-39.pyc b/ppdet/engine/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000..e3515bc Binary files /dev/null and b/ppdet/engine/__pycache__/trainer.cpython-39.pyc differ diff --git a/ppdet/engine/callbacks.py b/ppdet/engine/callbacks.py new file mode 100644 index 0000000..0798b91 --- /dev/null +++ b/ppdet/engine/callbacks.py @@ -0,0 +1,252 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
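+
+# Note: the classes below follow a small callback protocol. Every hook
+# receives a shared `status` dict (keys such as 'mode', 'epoch_id',
+# 'step_id', 'batch_time', ...), and ComposeCallback simply fans each hook
+# out to the wrapped Callback objects. An illustrative, hypothetical
+# composition, assuming a trainer object matching the `model` argument
+# these classes expect:
+#
+#   callbacks = ComposeCallback([LogPrinter(trainer), Checkpointer(trainer)])
+#   callbacks.on_step_end(status)    # LogPrinter prints training / eval logs
+#   callbacks.on_epoch_end(status)   # Checkpointer saves snapshot / best model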
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import datetime +import six +import numpy as np + +import paddle +import paddle.distributed as dist + +from ppdet.utils.checkpoint import save_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +__all__ = ['Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer'] + + +class Callback(object): + def __init__(self, model): + self.model = model + + def on_step_begin(self, status): + pass + + def on_step_end(self, status): + pass + + def on_epoch_begin(self, status): + pass + + def on_epoch_end(self, status): + pass + + +class ComposeCallback(object): + def __init__(self, callbacks): + callbacks = [c for c in list(callbacks) if c is not None] + for c in callbacks: + assert isinstance( + c, Callback), "callback should be subclass of Callback" + self._callbacks = callbacks + + def on_step_begin(self, status): + for c in self._callbacks: + c.on_step_begin(status) + + def on_step_end(self, status): + for c in self._callbacks: + c.on_step_end(status) + + def on_epoch_begin(self, status): + for c in self._callbacks: + c.on_epoch_begin(status) + + def on_epoch_end(self, status): + for c in self._callbacks: + c.on_epoch_end(status) + + +class LogPrinter(Callback): + def __init__(self, model): + super(LogPrinter, self).__init__(model) + + def on_step_end(self, status): + if dist.get_world_size() < 2 or dist.get_rank() == 0: + mode = status['mode'] + if mode == 'train': + epoch_id = status['epoch_id'] + step_id = status['step_id'] + steps_per_epoch = status['steps_per_epoch'] + training_staus = status['training_staus'] + batch_time = status['batch_time'] + data_time = status['data_time'] + + epoches = self.model.cfg.epoch + batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( + ))]['batch_size'] + + logs = training_staus.log() + space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' + if step_id % self.model.cfg.log_iter == 0: + eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id + eta_sec = eta_steps * batch_time.global_avg + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + ips = float(batch_size) / batch_time.avg + fmt = ' '.join([ + 'Epoch: [{}]', + '[{' + space_fmt + '}/{}]', + 'learning_rate: {lr:.6f}', + '{meters}', + 'eta: {eta}', + 'batch_cost: {btime}', + 'data_cost: {dtime}', + 'ips: {ips:.4f} images/s', + ]) + fmt = fmt.format( + epoch_id, + step_id, + steps_per_epoch, + lr=status['learning_rate'], + meters=logs, + eta=eta_str, + btime=str(batch_time), + dtime=str(data_time), + ips=ips) + logger.info(fmt) + if mode == 'eval': + step_id = status['step_id'] + if step_id % 100 == 0: + logger.info("Eval iter: {}".format(step_id)) + + def on_epoch_end(self, status): + if dist.get_world_size() < 2 or dist.get_rank() == 0: + mode = status['mode'] + if mode == 'eval': + sample_num = status['sample_num'] + cost_time = status['cost_time'] + logger.info('Total sample number: {}, averge FPS: {}'.format( + sample_num, sample_num / cost_time)) + + +class Checkpointer(Callback): + def __init__(self, model): + super(Checkpointer, self).__init__(model) + cfg = self.model.cfg + self.best_ap = 0. 
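+        # best_ap tracks the highest validation AP seen so far; on_epoch_end only
+        # saves 'best_model' when a new evaluation result beats this value.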
+ self.save_dir = os.path.join(self.model.cfg.save_dir, + self.model.cfg.filename) + if hasattr(self.model.model, 'student_model'): + self.weight = self.model.model.student_model + else: + self.weight = self.model.model + + def on_epoch_end(self, status): + # Checkpointer only performed during training + mode = status['mode'] + epoch_id = status['epoch_id'] + weight = None + save_name = None + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + end_epoch = self.model.cfg.epoch + if epoch_id % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: + save_name = str( + epoch_id) if epoch_id != end_epoch - 1 else "model_final" + weight = self.weight + elif mode == 'eval': + if 'save_best_model' in status and status['save_best_model']: + for metric in self.model._metrics: + map_res = metric.get_results() + key = 'bbox' if 'bbox' in map_res else 'mask' + if key not in map_res: + logger.warn("Evaluation results empty, this may be due to " \ + "training iterations being too few or not " \ + "loading the correct weights.") + return + if map_res[key][0] > self.best_ap: + self.best_ap = map_res[key][0] + save_name = 'best_model' + weight = self.weight + logger.info("Best test {} ap is {:0.3f}.".format( + key, self.best_ap)) + if weight: + save_model(weight, self.model.optimizer, self.save_dir, + save_name, epoch_id + 1) + + +class WiferFaceEval(Callback): + def __init__(self, model): + super(WiferFaceEval, self).__init__(model) + + def on_epoch_begin(self, status): + assert self.model.mode == 'eval', \ + "WiferFaceEval can only be set during evaluation" + for metric in self.model._metrics: + metric.update(self.model.model) + sys.exit() + + +class VisualDLWriter(Callback): + """ + Use VisualDL to log data or image + """ + + def __init__(self, model): + super(VisualDLWriter, self).__init__(model) + + assert six.PY3, "VisualDL requires Python >= 3.5" + try: + from visualdl import LogWriter + except Exception as e: + logger.error('visualdl not found, plaese install visualdl. ' + 'for example: `pip install visualdl`.') + raise e + self.vdl_writer = LogWriter(model.cfg.vdl_log_dir) + self.vdl_loss_step = 0 + self.vdl_mAP_step = 0 + self.vdl_image_step = 0 + self.vdl_image_frame = 0 + + def on_step_end(self, status): + mode = status['mode'] + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'train': + training_staus = status['training_staus'] + for loss_name, loss_value in training_staus.get().items(): + self.vdl_writer.add_scalar(loss_name, loss_value, + self.vdl_loss_step) + self.vdl_loss_step += 1 + elif mode == 'test': + ori_image = status['original_image'] + result_image = status['result_image'] + self.vdl_writer.add_image( + "original/frame_{}".format(self.vdl_image_frame), ori_image, + self.vdl_image_step) + self.vdl_writer.add_image( + "result/frame_{}".format(self.vdl_image_frame), + result_image, self.vdl_image_step) + self.vdl_image_step += 1 + # each frame can display ten pictures at most. 
+ if self.vdl_image_step % 10 == 0: + self.vdl_image_step = 0 + self.vdl_image_frame += 1 + + def on_epoch_end(self, status): + mode = status['mode'] + if dist.get_world_size() < 2 or dist.get_rank() == 0: + if mode == 'eval': + for metric in self.model._metrics: + for key, map_value in metric.get_results().items(): + self.vdl_writer.add_scalar("{}-mAP".format(key), + map_value[0], + self.vdl_mAP_step) + self.vdl_mAP_step += 1 diff --git a/ppdet/engine/env.py b/ppdet/engine/env.py new file mode 100644 index 0000000..cfeea08 --- /dev/null +++ b/ppdet/engine/env.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +import numpy as np + +import paddle +from paddle.distributed import fleet + +__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] + + +def init_fleet_env(): + fleet.init(is_collective=True) + + +def init_parallel_env(): + env = os.environ + dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env + if dist: + trainer_id = int(env['PADDLE_TRAINER_ID']) + local_seed = (99 + trainer_id) + random.seed(local_seed) + np.random.seed(local_seed) + + paddle.distributed.init_parallel_env() + + +def set_random_seed(seed): + random.seed(seed) + np.random.seed(seed) diff --git a/ppdet/engine/export_utils.py b/ppdet/engine/export_utils.py new file mode 100644 index 0000000..744775c --- /dev/null +++ b/ppdet/engine/export_utils.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
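`_parse_reader` below rewrites the `TestReader` configuration into the preprocessing list that gets written to `infer_cfg.yml`; notably, the training-time `PadBatch` batch transform (which also pads ground truth) is replaced by a plain `PadStride` op for deployment. A reduced, self-contained sketch of that mapping:

```python
# Stand-alone sketch of the PadBatch -> PadStride rewrite performed by
# _parse_reader; it does not call the module itself.
batch_transforms = [{'PadBatch': {'pad_to_stride': 32, 'pad_gt': True}}]

preprocess_list = []
for bt in batch_transforms:
    for key, value in bt.items():
        if key == 'PadBatch':
            # deploy-time inference has no ground truth, so only the stride
            # padding is kept
            preprocess_list.append({'type': 'PadStride',
                                    'stride': value['pad_to_stride']})

print(preprocess_list)  # [{'type': 'PadStride', 'stride': 32}]
```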
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import yaml +from collections import OrderedDict + +from ppdet.data.source.category import get_categories + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +# Global dictionary +TRT_MIN_SUBGRAPH = { + 'YOLO': 3, + 'SSD': 60, + 'RCNN': 40, + 'RetinaNet': 40, + 'S2ANet': 40, + 'EfficientDet': 40, + 'Face': 3, + 'TTFNet': 3, + 'FCOS': 16, + 'SOLOv2': 60, +} + + +def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): + preprocess_list = [] + + anno_file = dataset_cfg.get_anno() + + clsid2catid, catid2name = get_categories(metric, anno_file) + + label_list = [str(cat) for cat in catid2name.values()] + + sample_transforms = reader_cfg['sample_transforms'] + for st in sample_transforms[1:]: + for key, value in st.items(): + p = {'type': key} + p.update(value) + preprocess_list.append(p) + batch_transforms = reader_cfg.get('batch_transforms', None) + if batch_transforms: + methods = [list(bt.keys())[0] for bt in batch_transforms] + for bt in batch_transforms: + for key, value in bt.items(): + # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride, pad_gt) + if key == 'PadBatch': + preprocess_list.append({ + 'type': 'PadStride', + 'stride': value['pad_to_stride'] + }) + break + + return preprocess_list, label_list, image_shape + + +def _dump_infer_config(config, path, image_shape, model): + arch_state = False + from ppdet.core.config.yaml_helpers import setup_orderdict + setup_orderdict() + infer_cfg = OrderedDict({ + 'mode': 'fluid', + 'draw_threshold': 0.5, + 'metric': config['metric'], + 'image_shape': image_shape + }) + infer_arch = config['architecture'] + + for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items(): + if arch in infer_arch: + infer_cfg['arch'] = arch + infer_cfg['min_subgraph_size'] = min_subgraph_size + arch_state = True + break + if not arch_state: + logger.error( + 'Architecture: {} is not supported for exporting model now'.format( + infer_arch)) + os._exit(0) + if 'Mask' in infer_arch: + infer_cfg['mask'] = True + infer_cfg['Preprocess'], infer_cfg[ + 'label_list'], image_shape = _parse_reader( + config['TestReader'], config['TestDataset'], config['metric'], + infer_cfg['arch'], image_shape) + + yaml.dump(infer_cfg, open(path, 'w')) + logger.info("Export inference config file to {}".format(os.path.join(path))) + return image_shape diff --git a/ppdet/engine/trainer.py b/ppdet/engine/trainer.py new file mode 100644 index 0000000..2b17cde --- /dev/null +++ b/ppdet/engine/trainer.py @@ -0,0 +1,488 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
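For context, a typical driver script builds the `Trainer` defined below roughly as follows. This is an illustrative sketch only: the config path is a placeholder, and the `pretrain_weights` field is assumed to exist in the loaded config rather than being defined in this file.

```python
# Hypothetical usage sketch for the Trainer class below.
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer, set_random_seed

cfg = load_config('configs/some_model.yml')   # placeholder path
set_random_seed(0)

trainer = Trainer(cfg, mode='train')
trainer.load_weights(cfg.pretrain_weights)    # assumed config field
trainer.train(validate=True)                  # evaluates at snapshot epochs
```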
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import random +import datetime +import numpy as np +from PIL import Image + +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +from paddle import amp +from paddle.static import InputSpec +from ppdet.optimizer import ModelEMA + +from ppdet.core.workspace import create +from ppdet.utils.checkpoint import load_weight, load_pretrain_weight +from ppdet.utils.visualizer import visualize_results, save_result +from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, get_infer_results +from ppdet.data.source.category import get_categories +import ppdet.utils.stats as stats + +from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter +from .export_utils import _dump_infer_config + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +__all__ = ['Trainer'] + + +class Trainer(object): + def __init__(self, cfg, mode='train'): + self.cfg = cfg + assert mode.lower() in ['train', 'eval', 'test'], \ + "mode should be 'train', 'eval' or 'test'" + self.mode = mode.lower() + self.optimizer = None + self.is_loaded_weights = False + + # build model + if 'model' not in self.cfg: + self.model = create(cfg.architecture) + else: + self.model = self.cfg.model + self.is_loaded_weights = True + + self.use_ema = ('use_ema' in cfg and cfg['use_ema']) + if self.use_ema: + self.ema = ModelEMA( + cfg['ema_decay'], self.model, use_thres_step=True) + + # build data loader + self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())] + if self.mode == 'train': + self.loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, cfg.worker_num) + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + self.loader = create('{}Reader'.format(self.mode.capitalize()))( + self.dataset, cfg.worker_num, self._eval_batch_sampler) + # TestDataset build after user set images, skip loader creation here + + # build optimizer in train mode + if self.mode == 'train': + steps_per_epoch = len(self.loader) + self.lr = create('LearningRate')(steps_per_epoch) + self.optimizer = create('OptimizerBuilder')(self.lr, + self.model.parameters()) + + self._nranks = dist.get_world_size() + self._local_rank = dist.get_rank() + + self.status = {} + + self.start_epoch = 0 + self.end_epoch = cfg.epoch + + # initial default callbacks + self._init_callbacks() + + # initial default metrics + self._init_metrics() + self._reset_metrics() + + def _init_callbacks(self): + if self.mode == 'train': + self._callbacks = [LogPrinter(self), Checkpointer(self)] + if 'use_vdl' in self.cfg and self.cfg.use_vdl: + self._callbacks.append(VisualDLWriter(self)) + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'eval': + self._callbacks = [LogPrinter(self)] + if self.cfg.metric == 'WiderFace': + self._callbacks.append(WiferFaceEval(self)) + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'test' and 'use_vdl' in self.cfg and self.cfg.use_vdl: + self._callbacks = [VisualDLWriter(self)] + self._compose_callback = ComposeCallback(self._callbacks) + else: + self._callbacks = [] + self._compose_callback = None + + def _init_metrics(self, validate=False): 
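+        # Metrics are only built when evaluation will actually run: in eval mode,
+        # or in train mode with validate=True; otherwise the list stays empty.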
+ if self.mode == 'test' or (self.mode == 'train' and not validate): + self._metrics = [] + return + classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False + if self.cfg.metric == 'COCO': + # TODO: bias should be unified + bias = self.cfg['bias'] if 'bias' in self.cfg else 0 + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg['save_prediction_only'] \ + if 'save_prediction_only' in self.cfg else False + + # pass clsid2catid info to metric instance to avoid multiple loading + # annotation file + clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ + if self.mode == 'eval' else None + + # when do validation in train, annotation file should be get from + # EvalReader instead of self.dataset(which is TrainReader) + anno_file = self.dataset.get_anno() + if self.mode == 'train' and validate: + eval_dataset = self.cfg['EvalDataset'] + eval_dataset.check_or_download_dataset() + anno_file = eval_dataset.get_anno() + + self._metrics = [ + COCOMetric( + anno_file=anno_file, + clsid2catid=clsid2catid, + classwise=classwise, + output_eval=output_eval, + bias=bias, + save_prediction_only=save_prediction_only) + ] + elif self.cfg.metric == 'VOC': + self._metrics = [ + VOCMetric( + label_list=self.dataset.get_label_list(), + class_num=self.cfg.num_classes, + map_type=self.cfg.map_type, + classwise=classwise) + ] + elif self.cfg.metric == 'WiderFace': + multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True + self._metrics = [ + WiderFaceMetric( + image_dir=os.path.join(self.dataset.dataset_dir, + self.dataset.image_dir), + anno_file=self.dataset.get_anno(), + multi_scale=multi_scale) + ] + else: + logger.warn("Metric not support for metric type {}".format( + self.cfg.metric)) + self._metrics = [] + + def _reset_metrics(self): + for metric in self._metrics: + metric.reset() + + def register_callbacks(self, callbacks): + callbacks = [c for c in list(callbacks) if c is not None] + for c in callbacks: + assert isinstance(c, Callback), \ + "metrics shoule be instances of subclass of Metric" + self._callbacks.extend(callbacks) + self._compose_callback = ComposeCallback(self._callbacks) + + def register_metrics(self, metrics): + metrics = [m for m in list(metrics) if m is not None] + for m in metrics: + assert isinstance(m, Metric), \ + "metrics shoule be instances of subclass of Metric" + self._metrics.extend(metrics) + + def load_weights(self, weights): + if self.is_loaded_weights: + return + self.start_epoch = 0 + load_pretrain_weight(self.model, weights) + logger.debug("Load weights {} to start training".format(weights)) + + def resume_weights(self, weights): + # support Distill resume weights + if hasattr(self.model, 'student_model'): + self.start_epoch = load_weight(self.model.student_model, weights, + self.optimizer) + else: + self.start_epoch = load_weight(self.model, weights, self.optimizer) + logger.debug("Resume weights of epoch {}".format(self.start_epoch)) + + def train(self, validate=False): + assert self.mode == 'train', "Model not in 'train' mode" + + # if validation in training is enabled, metrics should be re-init + if validate: + self._init_metrics(validate=validate) + self._reset_metrics() + + model = self.model + if self.cfg.fleet: + model = fleet.distributed_model(model) + self.optimizer = fleet.distributed_optimizer( + self.optimizer).user_defined_optimizer + elif self._nranks > 1: + model = paddle.DataParallel(self.model) + + # initial fp16 + if self.cfg.fp16: + 
scaler = amp.GradScaler( + enable=self.cfg.use_gpu, init_loss_scaling=1024) + + self.status.update({ + 'epoch_id': self.start_epoch, + 'step_id': 0, + 'steps_per_epoch': len(self.loader) + }) + + self.status['batch_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['data_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) + + for epoch_id in range(self.start_epoch, self.cfg.epoch): + self.status['mode'] = 'train' + self.status['epoch_id'] = epoch_id + self._compose_callback.on_epoch_begin(self.status) + self.loader.dataset.set_epoch(epoch_id) + model.train() + iter_tic = time.time() + for step_id, data in enumerate(self.loader): + self.status['data_time'].update(time.time() - iter_tic) + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + + if self.cfg.fp16: + with amp.auto_cast(enable=self.cfg.use_gpu): + # model forward + outputs = model(data) + loss = outputs['loss'] + + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + # in dygraph mode, optimizer.minimize is equal to optimizer.step + scaler.minimize(self.optimizer, scaled_loss) + else: + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + loss.backward() + self.optimizer.step() + + curr_lr = self.optimizer.get_lr() + self.lr.step() + self.optimizer.clear_grad() + self.status['learning_rate'] = curr_lr + + if self._nranks < 2 or self._local_rank == 0: + self.status['training_staus'].update(outputs) + + self.status['batch_time'].update(time.time() - iter_tic) + self._compose_callback.on_step_end(self.status) + if self.use_ema: + self.ema.update(self.model) + iter_tic = time.time() + + # apply ema weight on model + if self.use_ema: + weight = self.model.state_dict() + self.model.set_dict(self.ema.apply()) + + self._compose_callback.on_epoch_end(self.status) + + if validate and (self._nranks < 2 or self._local_rank == 0) \ + and (epoch_id % self.cfg.snapshot_epoch == 0 \ + or epoch_id == self.end_epoch - 1): + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + with paddle.no_grad(): + self.status['save_best_model'] = True + self._eval_with_loader(self._eval_loader) + + # restore origin weight on model + if self.use_ema: + self.model.set_dict(weight) + + def _eval_with_loader(self, loader): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + self.model.eval() + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + outs = self.model(data) + + # update metrics + for metric in self._metrics: + metric.update(data, outs) + + sample_num += data['im_id'].numpy().shape[0] + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def 
evaluate(self): + self._eval_with_loader(self.loader) + + def predict(self, + images, + draw_threshold=0.5, + output_dir='output', + save_txt=False): + self.dataset.set_images(images) + loader = create('TestReader')(self.dataset, 0) + + imid2path = self.dataset.get_imid2path() + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories(self.cfg.metric, anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + # forward + outs = self.model(data) + for key in ['im_shape', 'scale_factor', 'im_id']: + outs[key] = data[key] + for key, value in outs.items(): + outs[key] = value.numpy() + + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + + image = visualize_results(image, bbox_res, mask_res, segm_res, + int(outs['im_id']), catid2name, + draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + if save_txt: + save_path = os.path.splitext(save_name)[0] + '.txt' + save_result(save_path, bbox_res, catid2name, draw_threshold) + start = end + + def _get_save_image_name(self, output_dir, image_path): + """ + Get save image name from source image path. 
+ """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + image_name = os.path.split(image_path)[-1] + name, ext = os.path.splitext(image_name) + return os.path.join(output_dir, "{}".format(name)) + ext + + def export(self, output_dir='output_inference'): + self.model.eval() + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] + save_dir = os.path.join(output_dir, model_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + image_shape = None + if 'inputs_def' in self.cfg['TestReader']: + inputs_def = self.cfg['TestReader']['inputs_def'] + image_shape = inputs_def.get('image_shape', None) + # set image_shape=[3, -1, -1] as default + if image_shape is None: + image_shape = [3, -1, -1] + + self.model.eval() + + # Save infer cfg + _dump_infer_config(self.cfg, + os.path.join(save_dir, 'infer_cfg.yml'), image_shape, + self.model) + + input_spec = [{ + "image": InputSpec( + shape=[None] + image_shape, name='image'), + "im_shape": InputSpec( + shape=[None, 2], name='im_shape'), + "scale_factor": InputSpec( + shape=[None, 2], name='scale_factor') + }] + + # dy2st and save model + if 'slim' not in self.cfg or self.cfg['slim'] != 'QAT': + static_model = paddle.jit.to_static( + self.model, input_spec=input_spec) + # NOTE: dy2st do not pruned program, but jit.save will prune program + # input spec, prune input spec here and save with pruned input spec + pruned_input_spec = self._prune_input_spec( + input_spec, static_model.forward.main_program, + static_model.forward.outputs) + paddle.jit.save( + static_model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + logger.info("Export model and saved in {}".format(save_dir)) + else: + self.cfg.slim.save_quantized_model( + self.model, + os.path.join(save_dir, 'model'), + input_spec=input_spec) + + def _prune_input_spec(self, input_spec, program, targets): + # try to prune static program to figure out pruned input spec + # so we perform following operations in static mode + paddle.enable_static() + pruned_input_spec = [{}] + program = program.clone() + program = program._prune(targets=targets) + global_block = program.global_block() + for name, spec in input_spec[0].items(): + try: + v = global_block.var(name) + pruned_input_spec[0][name] = spec + except Exception: + pass + paddle.disable_static() + return pruned_input_spec diff --git a/ppdet/ext_op/README.md b/ppdet/ext_op/README.md new file mode 100644 index 0000000..7ada0ac --- /dev/null +++ b/ppdet/ext_op/README.md @@ -0,0 +1,38 @@ +# 自定义OP编译 +旋转框IOU计算OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/07_new_op/new_custom_op.html) 。 + +## 1. 环境依赖 +- Paddle >= 2.0.1 +- gcc 8.2 + +## 2. 安装 +``` +python3.7 setup.py install +``` + +按照如下方式使用 +``` +# 引入自定义op +from rbox_iou_ops import rbox_iou + +paddle.set_device('gpu:0') +paddle.disable_static() + +rbox1 = np.random.rand(13000, 5) +rbox2 = np.random.rand(7, 5) + +pd_rbox1 = paddle.to_tensor(rbox1) +pd_rbox2 = paddle.to_tensor(rbox2) + +iou = rbox_iou(pd_rbox1, pd_rbox2) +print('iou', iou) +``` + +## 3. 单元测试 +单元测试`test.py`文件中,通过对比python实现的结果和测试自定义op结果。 + +由于python计算细节与cpp计算细节略有区别,误差区间设置为0.02。 +``` +python3.7 test.py +``` +提示`rbox_iou OP compute right!`说明OP测试通过。 diff --git a/ppdet/ext_op/rbox_iou_op.cc b/ppdet/ext_op/rbox_iou_op.cc new file mode 100644 index 0000000..05890fd --- /dev/null +++ b/ppdet/ext_op/rbox_iou_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/extension.h" + +#include + +std::vector RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2); +std::vector RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2); + + +#define CHECK_INPUT_SAME(x1, x2) PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") +std::vector RboxIouForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) { + CHECK_INPUT_SAME(rbox1, rbox2); + if (rbox1.place() == paddle::PlaceType::kCPU) { + return RboxIouCPUForward(rbox1, rbox2); + } + else if (rbox1.place() == paddle::PlaceType::kGPU) { + return RboxIouCUDAForward(rbox1, rbox2); + } +} + +std::vector> InferShape(std::vector rbox1_shape, std::vector rbox2_shape) { + return {{rbox1_shape[0], rbox2_shape[0]}}; +} + +std::vector InferDtype(paddle::DataType t1, paddle::DataType t2) { + return {t1}; +} + +PD_BUILD_OP(rbox_iou) + .Inputs({"RBOX1", "RBOX2"}) + .Outputs({"Output"}) + .SetKernelFn(PD_KERNEL(RboxIouForward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)); diff --git a/ppdet/ext_op/rbox_iou_op.cu b/ppdet/ext_op/rbox_iou_op.cu new file mode 100644 index 0000000..0581f78 --- /dev/null +++ b/ppdet/ext_op/rbox_iou_op.cu @@ -0,0 +1,507 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +#include "paddle/extension.h" + +#include + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +template +HOST_DEVICE_INLINE T cross_2d(const Point& A, const Point& B) { + return A.x * B.y - B.x * A.y; +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices( + const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. == 0.01745329251 + //double theta = box.a * 0.01745329251; + //MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points( + const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. 
P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham( + const Point (&p)[24], + const int& num_in, + Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. + int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + +#ifdef __CUDACC__ + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort( + q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { + T temp = cross_2d(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0; + } + }); +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. 
+ // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rboxes_intersection( + const RotatedBox& box1, + const RotatedBox& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T +rbox_iou_single(T const* const box1_raw, T const* const box2_raw) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = rboxes_intersection(box1, box2); + const T iou = intersection / (area1 + area2 - intersection); + return iou; +} + + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T CeilDiv0(T a, T b) { + return (a + b - 1) / b; +} + +static inline int CeilDiv(const int a, const int b) { + return (a + b -1) / b; +} + +template +__global__ void rbox_iou_cuda_kernel( + const int rbox1_num, + const int rbox2_num, + const T* rbox1_data_ptr, + const T* rbox2_data_ptr, + T* output_data_ptr) { + + // get row_start and col_start + const int rbox1_block_idx = blockIdx.x * blockDim.x; + const int rbox2_block_idx = blockIdx.y * blockDim.y; + + const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); + const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); + + __shared__ T block_boxes1[BLOCK_DIM_X * 5]; + __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; 
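+    // Tiling scheme: each block caches up to BLOCK_DIM_X rbox1 boxes and
+    // BLOCK_DIM_Y rbox2 boxes in shared memory, then each thread computes the
+    // IoU of one (rbox1, rbox2) pair from the cached tiles.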
+ + + // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y + if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { + block_boxes1[threadIdx.x * 5 + 0] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; + block_boxes1[threadIdx.x * 5 + 1] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; + block_boxes1[threadIdx.x * 5 + 2] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; + block_boxes1[threadIdx.x * 5 + 3] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; + block_boxes1[threadIdx.x * 5 + 4] = + rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; + } + + // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as above: threadIdx.y == 0 + if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { + block_boxes2[threadIdx.x * 5 + 0] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; + block_boxes2[threadIdx.x * 5 + 1] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; + block_boxes2[threadIdx.x * 5 + 2] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; + block_boxes2[threadIdx.x * 5 + 3] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; + block_boxes2[threadIdx.x * 5 + 4] = + rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; + } + + // sync + __syncthreads(); + + if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { + int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + threadIdx.y; + output_data_ptr[offset] = rbox_iou_single(block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); + } +} + +#define CHECK_INPUT_GPU(x) PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") + +std::vector RboxIouCUDAForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) { + CHECK_INPUT_GPU(rbox1); + CHECK_INPUT_GPU(rbox2); + + auto rbox1_num = rbox1.shape()[0]; + auto rbox2_num = rbox2.shape()[0]; + + auto output = paddle::Tensor(paddle::PlaceType::kGPU); + output.reshape({rbox1_num, rbox2_num}); + + const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); + const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); + + dim3 blocks(blocks_x, blocks_y); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + + PD_DISPATCH_FLOATING_TYPES( + rbox1.type(), + "rbox_iou_cuda_kernel", + ([&] { + rbox_iou_cuda_kernel<<>>( + rbox1_num, + rbox2_num, + rbox1.data(), + rbox2.data(), + output.mutable_data()); + })); + + return {output}; +} + + +template +void rbox_iou_cpu_kernel( + const int rbox1_num, + const int rbox2_num, + const T* rbox1_data_ptr, + const T* rbox2_data_ptr, + T* output_data_ptr) { + + int i, j; + for (i = 0; i < rbox1_num; i++) { + for (j = 0; j < rbox2_num; j++) { + int offset = i * rbox2_num + j; + output_data_ptr[offset] = rbox_iou_single(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5); + } + } +} + + +#define CHECK_INPUT_CPU(x) PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +std::vector RboxIouCPUForward(const paddle::Tensor& rbox1, const paddle::Tensor& rbox2) { + CHECK_INPUT_CPU(rbox1); + CHECK_INPUT_CPU(rbox2); + + auto rbox1_num = rbox1.shape()[0]; + auto rbox2_num = rbox2.shape()[0]; + + auto output = paddle::Tensor(paddle::PlaceType::kCPU); + output.reshape({rbox1_num, rbox2_num}); + + PD_DISPATCH_FLOATING_TYPES( + rbox1.type(), + "rbox_iou_cpu_kernel", + ([&] { + rbox_iou_cpu_kernel( + rbox1_num, + rbox2_num, + rbox1.data(), + rbox2.data(), + output.mutable_data()); + })); + + return {output}; +} diff --git a/ppdet/ext_op/setup.py b/ppdet/ext_op/setup.py new file mode 
100644 index 0000000..6859f0c --- /dev/null +++ b/ppdet/ext_op/setup.py @@ -0,0 +1,6 @@ +from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup + +if __name__ == "__main__": + setup( + name='rbox_iou_ops', + ext_modules=CUDAExtension(sources=['rbox_iou_op.cc', 'rbox_iou_op.cu'])) diff --git a/ppdet/ext_op/test.py b/ppdet/ext_op/test.py new file mode 100644 index 0000000..83403ed --- /dev/null +++ b/ppdet/ext_op/test.py @@ -0,0 +1,154 @@ +import numpy as np +import os +import sys +import cv2 +import time +import shapely +from shapely.geometry import Polygon +import paddle + +paddle.set_device('gpu:0') +paddle.disable_static() + +try: + from rbox_iou_ops import rbox_iou +except Exception as e: + print('import custom_ops error', e) + sys.exit(-1) + +# generate random data +rbox1 = np.random.rand(13000, 5) +rbox2 = np.random.rand(7, 5) + +# x1 y1 w h [0, 0.5] +rbox1[:, 0:4] = rbox1[:, 0:4] * 0.45 + 0.001 +rbox2[:, 0:4] = rbox2[:, 0:4] * 0.45 + 0.001 + +# generate rbox +rbox1[:, 4] = rbox1[:, 4] - 0.5 +rbox2[:, 4] = rbox2[:, 4] - 0.5 + +print('rbox1', rbox1.shape, 'rbox2', rbox2.shape) + +# to paddle tensor +pd_rbox1 = paddle.to_tensor(rbox1) +pd_rbox2 = paddle.to_tensor(rbox2) + +iou = rbox_iou(pd_rbox1, pd_rbox2) +start_time = time.time() +print('paddle time:', time.time() - start_time) +print('iou is', iou.cpu().shape) + + +# get gt +def rbox2poly_single(rrect, get_best_begin_point=False): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + return poly + + +def intersection(g, p): + """ + Intersection. + """ + + g = g[:8].reshape((4, 2)) + p = p[:8].reshape((4, 2)) + + a = g + b = p + + use_filter = True + if use_filter: + # step1: + inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) + inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) + inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) + inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) + if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: + return 0. + x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) + x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) + y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) + y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) + if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: + return 0. 
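+    # Quick-reject passed: the axis-aligned bounds of the two quadrilaterals
+    # overlap, so fall through to the exact shapely intersection below.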
+ + g = Polygon(g) + p = Polygon(p) + #g = g.buffer(0) + #p = p.buffer(0) + if not g.is_valid or not p.is_valid: + return 0 + + inter = Polygon(g).intersection(Polygon(p)).area + union = g.area + p.area - inter + if union == 0: + return 0 + else: + return inter / union + + +# rbox_iou by python +def rbox_overlaps(anchors, gt_bboxes, use_cv2=False): + """ + + Args: + anchors: [NA, 5] x1,y1,x2,y2,angle + gt_bboxes: [M, 5] x1,y1,x2,y2,angle + + Returns: + + """ + assert anchors.shape[1] == 5 + assert gt_bboxes.shape[1] == 5 + + gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] + anchors_ploy = [rbox2poly_single(e) for e in anchors] + + num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy) + iou = np.zeros((num_gt, num_anchors), dtype=np.float32) + + start_time = time.time() + for i in range(num_gt): + for j in range(num_anchors): + try: + iou[i, j] = intersection(gt_bboxes_ploy[i], anchors_ploy[j]) + except Exception as e: + print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], + 'anchors_ploy[j]', anchors_ploy[j], e) + iou = iou.T + print('intersection all sp_time', time.time() - start_time) + return iou + + +# make coor as int +ploy_rbox1 = rbox1 +ploy_rbox2 = rbox2 +ploy_rbox1[:, 0:4] = rbox1[:, 0:4] * 1024 +ploy_rbox2[:, 0:4] = rbox2[:, 0:4] * 1024 + +start_time = time.time() +iou_py = rbox_overlaps(ploy_rbox1, ploy_rbox2, use_cv2=False) +print('rbox time', time.time() - start_time) +print(iou_py.shape) + +iou_pd = iou.cpu().numpy() +sum_abs_diff = np.sum(np.abs(iou_pd - iou_py)) +print('sum of abs diff', sum_abs_diff) +if sum_abs_diff < 0.02: + print("rbox_iou OP compute right!") diff --git a/ppdet/metrics/__init__.py b/ppdet/metrics/__init__.py new file mode 100644 index 0000000..460b12d --- /dev/null +++ b/ppdet/metrics/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import metrics +from .metrics import * + +__all__ = metrics.__all__ diff --git a/ppdet/metrics/__pycache__/__init__.cpython-38.pyc b/ppdet/metrics/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..1dad3f6 Binary files /dev/null and b/ppdet/metrics/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/__init__.cpython-39.pyc b/ppdet/metrics/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..7028d5a Binary files /dev/null and b/ppdet/metrics/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/metrics/__pycache__/coco_utils.cpython-38.pyc b/ppdet/metrics/__pycache__/coco_utils.cpython-38.pyc new file mode 100644 index 0000000..6a3af65 Binary files /dev/null and b/ppdet/metrics/__pycache__/coco_utils.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/coco_utils.cpython-39.pyc b/ppdet/metrics/__pycache__/coco_utils.cpython-39.pyc new file mode 100644 index 0000000..35a6998 Binary files /dev/null and b/ppdet/metrics/__pycache__/coco_utils.cpython-39.pyc differ diff --git a/ppdet/metrics/__pycache__/json_results.cpython-38.pyc b/ppdet/metrics/__pycache__/json_results.cpython-38.pyc new file mode 100644 index 0000000..8777d0e Binary files /dev/null and b/ppdet/metrics/__pycache__/json_results.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/json_results.cpython-39.pyc b/ppdet/metrics/__pycache__/json_results.cpython-39.pyc new file mode 100644 index 0000000..d55a313 Binary files /dev/null and b/ppdet/metrics/__pycache__/json_results.cpython-39.pyc differ diff --git a/ppdet/metrics/__pycache__/map_utils.cpython-38.pyc b/ppdet/metrics/__pycache__/map_utils.cpython-38.pyc new file mode 100644 index 0000000..0afc391 Binary files /dev/null and b/ppdet/metrics/__pycache__/map_utils.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/map_utils.cpython-39.pyc b/ppdet/metrics/__pycache__/map_utils.cpython-39.pyc new file mode 100644 index 0000000..34254e8 Binary files /dev/null and b/ppdet/metrics/__pycache__/map_utils.cpython-39.pyc differ diff --git a/ppdet/metrics/__pycache__/metrics.cpython-38.pyc b/ppdet/metrics/__pycache__/metrics.cpython-38.pyc new file mode 100644 index 0000000..2eacf81 Binary files /dev/null and b/ppdet/metrics/__pycache__/metrics.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/metrics.cpython-39.pyc b/ppdet/metrics/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000..be8c715 Binary files /dev/null and b/ppdet/metrics/__pycache__/metrics.cpython-39.pyc differ diff --git a/ppdet/metrics/__pycache__/widerface_utils.cpython-38.pyc b/ppdet/metrics/__pycache__/widerface_utils.cpython-38.pyc new file mode 100644 index 0000000..dcacb84 Binary files /dev/null and b/ppdet/metrics/__pycache__/widerface_utils.cpython-38.pyc differ diff --git a/ppdet/metrics/__pycache__/widerface_utils.cpython-39.pyc b/ppdet/metrics/__pycache__/widerface_utils.cpython-39.pyc new file mode 100644 index 0000000..40f6dbe Binary files /dev/null and b/ppdet/metrics/__pycache__/widerface_utils.cpython-39.pyc differ diff --git a/ppdet/metrics/coco_utils.py b/ppdet/metrics/coco_utils.py new file mode 100644 index 0000000..a7ac322 --- /dev/null +++ b/ppdet/metrics/coco_utils.py @@ -0,0 +1,168 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import numpy as np +import itertools + +from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res +from ppdet.metrics.map_utils import draw_pr_curve + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +def get_infer_results(outs, catid, bias=0): + """ + Get result at the stage of inference. + The output format is dictionary containing bbox or mask result. + + For example, bbox result is a list and each element contains + image_id, category_id, bbox and score. + """ + if outs is None or len(outs) == 0: + raise ValueError( + 'The number of valid detection result if zero. Please use reasonable model and check input data.' + ) + + im_id = outs['im_id'] + + infer_res = {} + if 'bbox' in outs: + if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6: + infer_res['bbox'] = get_det_poly_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + else: + infer_res['bbox'] = get_det_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + + if 'mask' in outs: + # mask post process + infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'], + outs['bbox_num'], im_id, catid) + + if 'segm' in outs: + infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid) + + return infer_res + + +def cocoapi_eval(jsonfile, + style, + coco_gt=None, + anno_file=None, + max_dets=(100, 300, 1000), + classwise=False): + """ + Args: + jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. + style (str): COCOeval style, can be `bbox` , `segm` and `proposal`. + coco_gt (str): Whether to load COCOAPI through anno_file, + eg: coco_gt = COCO(anno_file) + anno_file (str): COCO annotations file. + max_dets (tuple): COCO evaluation maxDets. + classwise (bool): Whether per-category AP and draw P-R Curve or not. + """ + assert coco_gt != None or anno_file != None + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + if coco_gt == None: + coco_gt = COCO(anno_file) + logger.info("Start evaluate...") + coco_dt = coco_gt.loadRes(jsonfile) + if style == 'proposal': + coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') + coco_eval.params.useCats = 0 + coco_eval.params.maxDets = list(max_dets) + else: + coco_eval = COCOeval(coco_gt, coco_dt, style) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if classwise: + # Compute per-category AP and PR curve + try: + from terminaltables import AsciiTable + except Exception as e: + logger.error( + 'terminaltables not found, plaese install terminaltables. 
' + 'for example: `pip install terminaltables`.') + raise e + precisions = coco_eval.eval['precision'] + cat_ids = coco_gt.getCatIds() + # precision: (iou, recall, cls, area range, max dets) + assert len(cat_ids) == precisions.shape[2] + results_per_category = [] + for idx, catId in enumerate(cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = coco_gt.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (str(nm["name"]), '{:0.3f}'.format(float(ap)))) + pr_array = precisions[0, :, idx, 0, 2] + recall_array = np.arange(0.0, 1.01, 0.01) + draw_pr_curve( + pr_array, + recall_array, + out_dir=style + '_pr_curve', + file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest( + *[results_flatten[i::num_columns] for i in range(num_columns)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) + logger.info("per-category PR curve has output to {} folder.".format( + style + '_pr_curve')) + # flush coco evaluation result + sys.stdout.flush() + return coco_eval.stats + + +def json_eval_results(metric, json_directory, dataset): + """ + cocoapi eval with already exists proposal.json, bbox.json or mask.json + """ + assert metric == 'COCO' + anno_file = dataset.get_anno() + json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] + if json_directory: + assert os.path.exists( + json_directory), "The json directory:{} does not exist".format( + json_directory) + for k, v in enumerate(json_file_list): + json_file_list[k] = os.path.join(str(json_directory), v) + + coco_eval_style = ['proposal', 'bbox', 'segm'] + for i, v_json in enumerate(json_file_list): + if os.path.exists(v_json): + cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) + else: + logger.info("{} not exists!".format(v_json)) diff --git a/ppdet/metrics/json_results.py b/ppdet/metrics/json_results.py new file mode 100644 index 0000000..f560766 --- /dev/null +++ b/ppdet/metrics/json_results.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
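The helpers below flatten raw network outputs into COCO-style result dicts. The key detail in `get_det_res` is the box format change: model boxes arrive as `[xmin, ymin, xmax, ymax]`, while COCO evaluation expects `[x, y, width, height]`. A reduced sketch of that conversion (image id, category map and box values are made up; `bias` is taken as 0):

```python
# Stand-alone illustration of the xyxy -> xywh conversion done per detection.
import numpy as np

label_to_cat_id_map = {0: 1}  # model class index -> COCO category id (illustrative)
det = np.array([0.0, 0.92, 48.0, 60.0, 148.0, 260.0])  # [cls, score, x1, y1, x2, y2]

num_id, score, xmin, ymin, xmax, ymax = det.tolist()
dt_res = {
    'image_id': 42,
    'category_id': label_to_cat_id_map[int(num_id)],
    'bbox': [xmin, ymin, xmax - xmin, ymax - ymin],  # COCO xywh
    'score': score,
}
print(dt_res)
```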
+import six +import os +import numpy as np +import cv2 + + +def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, xmin, ymin, xmax, ymax = dt.tolist() + if int(num_id) < 0: + continue + category_id = label_to_cat_id_map[int(num_id)] + w = xmax - xmin + bias + h = ymax - ymin + bias + bbox = [xmin, ymin, w, h] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': bbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() + if int(num_id) < 0: + continue + category_id = int(num_id) + rbox = [x1, y1, x2, y2, x3, y3, x4, y4] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': rbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() + if int(num_id) < 0: + continue + category_id = int(num_id) + rbox = [x1, y1, x2, y2, x3, y3, x4, y4] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': rbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): + import pycocotools.mask as mask_util + seg_res = [] + k = 0 + for i in range(len(mask_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = mask_nums[i] + for j in range(det_nums): + mask = masks[k].astype(np.uint8) + score = float(bboxes[k][1]) + label = int(bboxes[k][0]) + k = k + 1 + if label == -1: + continue + cat_id = label_to_cat_id_map[label] + rle = mask_util.encode( + np.array( + mask[:, :, None], order="F", dtype="uint8"))[0] + if six.PY3: + if 'counts' in rle: + rle['counts'] = rle['counts'].decode("utf8") + sg_res = { + 'image_id': cur_image_id, + 'category_id': cat_id, + 'segmentation': rle, + 'score': score + } + seg_res.append(sg_res) + return seg_res + + +def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): + import pycocotools.mask as mask_util + segm_res = [] + # for each batch + segms = results['segm'].astype(np.uint8) + clsid_labels = results['cate_label'] + clsid_scores = results['cate_score'] + lengths = segms.shape[0] + im_id = int(image_id[0][0]) + if lengths == 0 or segms is None: + return None + # for each sample + for i in range(lengths - 1): + clsid = int(clsid_labels[i]) + catid = num_id_to_cat_id_map[clsid] + score = float(clsid_scores[i]) + mask = segms[i] + segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] + segm['counts'] = segm['counts'].decode('utf8') + coco_res = { + 'image_id': im_id, + 'category_id': catid, + 'segmentation': segm, + 'score': score + } + segm_res.append(coco_res) + return segm_res diff --git a/ppdet/metrics/map_utils.py b/ppdet/metrics/map_utils.py new file mode 100644 index 0000000..17730bc --- /dev/null +++ 
b/ppdet/metrics/map_utils.py @@ -0,0 +1,306 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import numpy as np +import itertools + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'draw_pr_curve', 'bbox_area', 'jaccard_overlap', 'prune_zero_padding', + 'DetectionMAP' +] + + +def draw_pr_curve(precision, + recall, + iou=0.5, + out_dir='pr_curve', + file_name='precision_recall_curve.jpg'): + if not os.path.exists(out_dir): + os.makedirs(out_dir) + output_path = os.path.join(out_dir, file_name) + try: + import matplotlib.pyplot as plt + except Exception as e: + logger.error('Matplotlib not found, plaese install matplotlib.' + 'for example: `pip install matplotlib`.') + raise e + plt.cla() + plt.figure('P-R Curve') + plt.title('Precision/Recall Curve(IoU={})'.format(iou)) + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.grid(True) + plt.plot(recall, precision) + plt.savefig(output_path) + + +def bbox_area(bbox, is_bbox_normalized): + """ + Calculate area of a bounding box + """ + norm = 1. - float(is_bbox_normalized) + width = bbox[2] - bbox[0] + norm + height = bbox[3] - bbox[1] + norm + return width * height + + +def jaccard_overlap(pred, gt, is_bbox_normalized=False): + """ + Calculate jaccard overlap ratio between two bounding box + """ + if pred[0] >= gt[2] or pred[2] <= gt[0] or \ + pred[1] >= gt[3] or pred[3] <= gt[1]: + return 0. + inter_xmin = max(pred[0], gt[0]) + inter_ymin = max(pred[1], gt[1]) + inter_xmax = min(pred[2], gt[2]) + inter_ymax = min(pred[3], gt[3]) + inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], + is_bbox_normalized) + pred_size = bbox_area(pred, is_bbox_normalized) + gt_size = bbox_area(gt, is_bbox_normalized) + overlap = float(inter_size) / (pred_size + gt_size - inter_size) + return overlap + + +def prune_zero_padding(gt_box, gt_label, difficult=None): + valid_cnt = 0 + for i in range(len(gt_box)): + if gt_box[i, 0] == 0 and gt_box[i, 1] == 0 and \ + gt_box[i, 2] == 0 and gt_box[i, 3] == 0: + break + valid_cnt += 1 + return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] + if difficult is not None else None) + + +class DetectionMAP(object): + """ + Calculate detection mean average precision. + Currently support two types: 11point and integral + + Args: + class_num (int): The class number. + overlap_thresh (float): The threshold of overlap + ratio between prediction bounding box and + ground truth bounding box for deciding + true/false positive. Default 0.5. + map_type (str): Calculation method of mean average + precision, currently support '11point' and + 'integral'. Default '11point'. + is_bbox_normalized (bool): Whether bounding boxes + is normalized to range[0, 1]. Default False. 
+ evaluate_difficult (bool): Whether to evaluate + difficult bounding boxes. Default False. + catid2name (dict): Mapping between category id and category name. + classwise (bool): Whether per-category AP and draw + P-R Curve or not. + """ + + def __init__(self, + class_num, + overlap_thresh=0.5, + map_type='11point', + is_bbox_normalized=False, + evaluate_difficult=False, + catid2name=None, + classwise=False): + self.class_num = class_num + self.overlap_thresh = overlap_thresh + assert map_type in ['11point', 'integral'], \ + "map_type currently only support '11point' "\ + "and 'integral'" + self.map_type = map_type + self.is_bbox_normalized = is_bbox_normalized + self.evaluate_difficult = evaluate_difficult + self.classwise = classwise + self.classes = [] + for cname in catid2name.values(): + self.classes.append(cname) + self.reset() + + def update(self, bbox, score, label, gt_box, gt_label, difficult=None): + """ + Update metric statics from given prediction and ground + truth infomations. + """ + if difficult is None: + difficult = np.zeros_like(gt_label) + + # record class gt count + for gtl, diff in zip(gt_label, difficult): + if self.evaluate_difficult or int(diff) == 0: + self.class_gt_counts[int(np.array(gtl))] += 1 + + # record class score positive + visited = [False] * len(gt_label) + for b, s, l in zip(bbox, score, label): + xmin, ymin, xmax, ymax = b.tolist() + pred = [xmin, ymin, xmax, ymax] + max_idx = -1 + max_overlap = -1.0 + for i, gl in enumerate(gt_label): + if int(gl) == int(l): + overlap = jaccard_overlap(pred, gt_box[i], + self.is_bbox_normalized) + if overlap > max_overlap: + max_overlap = overlap + max_idx = i + + if max_overlap > self.overlap_thresh: + if self.evaluate_difficult or \ + int(np.array(difficult[max_idx])) == 0: + if not visited[max_idx]: + self.class_score_poss[int(l)].append([s, 1.0]) + visited[max_idx] = True + else: + self.class_score_poss[int(l)].append([s, 0.0]) + else: + self.class_score_poss[int(l)].append([s, 0.0]) + + def reset(self): + """ + Reset metric statics + """ + self.class_score_poss = [[] for _ in range(self.class_num)] + self.class_gt_counts = [0] * self.class_num + self.mAP = None + + def accumulate(self): + """ + Accumulate metric results and calculate mAP + """ + mAP = 0. + valid_cnt = 0 + eval_results = [] + for score_pos, count in zip(self.class_score_poss, + self.class_gt_counts): + if count == 0: continue + if len(score_pos) == 0: + valid_cnt += 1 + continue + + accum_tp_list, accum_fp_list = \ + self._get_tp_fp_accum(score_pos) + precision = [] + recall = [] + for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): + precision.append(float(ac_tp) / (ac_tp + ac_fp)) + recall.append(float(ac_tp) / count) + + one_class_ap = 0.0 + if self.map_type == '11point': + max_precisions = [0.] * 11 + start_idx = len(precision) - 1 + for j in range(10, -1, -1): + for i in range(start_idx, -1, -1): + if recall[i] < float(j) / 10.: + start_idx = i + if j > 0: + max_precisions[j - 1] = max_precisions[j] + break + else: + if max_precisions[j] < precision[i]: + max_precisions[j] = precision[i] + one_class_ap = sum(max_precisions) / 11. + mAP += one_class_ap + valid_cnt += 1 + elif self.map_type == 'integral': + import math + prev_recall = 0. 
+ for i in range(len(precision)): + recall_gap = math.fabs(recall[i] - prev_recall) + if recall_gap > 1e-6: + one_class_ap += precision[i] * recall_gap + prev_recall = recall[i] + mAP += one_class_ap + valid_cnt += 1 + else: + logger.error("Unspported mAP type {}".format(self.map_type)) + sys.exit(1) + eval_results.append({ + 'class': self.classes[valid_cnt - 1], + 'ap': one_class_ap, + 'precision': precision, + 'recall': recall, + }) + self.eval_results = eval_results + self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP + + def get_map(self): + """ + Get mAP result + """ + if self.mAP is None: + logger.error("mAP is not calculated.") + if self.classwise: + # Compute per-category AP and PR curve + try: + from terminaltables import AsciiTable + except Exception as e: + logger.error( + 'terminaltables not found, plaese install terminaltables. ' + 'for example: `pip install terminaltables`.') + raise e + results_per_category = [] + for eval_result in self.eval_results: + results_per_category.append( + (str(eval_result['class']), + '{:0.3f}'.format(float(eval_result['ap'])))) + draw_pr_curve( + eval_result['precision'], + eval_result['recall'], + out_dir='voc_pr_curve', + file_name='{}_precision_recall_curve.jpg'.format( + eval_result['class'])) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest( + *[results_flatten[i::num_columns] for i in range(num_columns)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('Per-category of VOC AP: \n{}'.format(table.table)) + logger.info( + "per-category PR curve has output to voc_pr_curve folder.") + return self.mAP + + def _get_tp_fp_accum(self, score_pos_list): + """ + Calculate accumulating true/false positive results from + [score, pos] records + """ + sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) + accum_tp = 0 + accum_fp = 0 + accum_tp_list = [] + accum_fp_list = [] + for (score, pos) in sorted_list: + accum_tp += int(pos) + accum_tp_list.append(accum_tp) + accum_fp += 1 - int(pos) + accum_fp_list.append(accum_fp) + return accum_tp_list, accum_fp_list diff --git a/ppdet/metrics/metrics.py b/ppdet/metrics/metrics.py new file mode 100644 index 0000000..e4ad154 --- /dev/null +++ b/ppdet/metrics/metrics.py @@ -0,0 +1,260 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
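[editor's note] As a concrete illustration of the 11-point interpolation above, a toy run of DetectionMAP on a single image with one matching prediction; the class names are invented for the example and are not part of this patch.

# Illustrative only (not committed code): one predicted box of class 0 that
# overlaps its ground truth with IoU > 0.5, so AP for that class is 1.0.
import numpy as np
from ppdet.metrics.map_utils import DetectionMAP

detection_map = DetectionMAP(
    class_num=3,
    overlap_thresh=0.5,
    map_type='11point',
    catid2name={0: 'person', 1: 'car', 2: 'dog'})  # hypothetical classes
bbox = np.array([[10., 10., 50., 60.]])
score = np.array([0.9])
label = np.array([0])
gt_box = np.array([[12., 11., 48., 58.]])
gt_label = np.array([[0]])
detection_map.update(bbox, score, label, gt_box, gt_label)
detection_map.accumulate()
print(detection_map.get_map())  # mAP over classes that have ground truth -> 1.0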
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import json +import paddle +import numpy as np + +from .map_utils import prune_zero_padding, DetectionMAP +from .coco_utils import get_infer_results, cocoapi_eval +from .widerface_utils import face_eval_run +from ppdet.data.source.category import get_categories + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results' +] + + +class Metric(paddle.metric.Metric): + def name(self): + return self.__class__.__name__ + + def reset(self): + pass + + def accumulate(self): + pass + + # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` + # :metch:`reset`, in ppdet, we also need following 2 methods: + + # abstract method for logging metric results + def log(self): + pass + + # abstract method for getting metric results + def get_results(self): + pass + + +class COCOMetric(Metric): + def __init__(self, anno_file, **kwargs): + assert os.path.isfile(anno_file), \ + "anno_file {} not a file".format(anno_file) + self.anno_file = anno_file + self.clsid2catid = kwargs.get('clsid2catid', None) + if self.clsid2catid is None: + self.clsid2catid, _ = get_categories('COCO', anno_file) + self.classwise = kwargs.get('classwise', False) + self.output_eval = kwargs.get('output_eval', None) + # TODO: bias should be unified + self.bias = kwargs.get('bias', 0) + self.save_prediction_only = kwargs.get('save_prediction_only', False) + self.reset() + + def reset(self): + # only bbox and mask evaluation support currently + self.results = {'bbox': [], 'mask': [], 'segm': []} + self.eval_results = {} + + def update(self, inputs, outputs): + outs = {} + # outputs Tensor -> numpy.ndarray + for k, v in outputs.items(): + outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v + + im_id = inputs['im_id'] + outs['im_id'] = im_id.numpy() if isinstance(im_id, + paddle.Tensor) else im_id + + infer_results = get_infer_results( + outs, self.clsid2catid, bias=self.bias) + self.results['bbox'] += infer_results[ + 'bbox'] if 'bbox' in infer_results else [] + self.results['mask'] += infer_results[ + 'mask'] if 'mask' in infer_results else [] + self.results['segm'] += infer_results[ + 'segm'] if 'segm' in infer_results else [] + + def accumulate(self): + if len(self.results['bbox']) > 0: + output = "bbox.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results['bbox'], f) + logger.info('The bbox result is saved to bbox.json.') + + if self.save_prediction_only: + logger.info('The bbox result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + bbox_stats = cocoapi_eval( + output, + 'bbox', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['bbox'] = bbox_stats + sys.stdout.flush() + + if len(self.results['mask']) > 0: + output = "mask.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results['mask'], f) + logger.info('The mask result is saved to mask.json.') + + if self.save_prediction_only: + logger.info('The mask result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + seg_stats = cocoapi_eval( + output, + 'segm', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['mask'] = seg_stats + sys.stdout.flush() + + if 
len(self.results['segm']) > 0: + output = "segm.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results['segm'], f) + logger.info('The segm result is saved to segm.json.') + + if self.save_prediction_only: + logger.info('The segm result is saved to {} and do not ' + 'evaluate the mAP.'.format(output)) + else: + seg_stats = cocoapi_eval( + output, + 'segm', + anno_file=self.anno_file, + classwise=self.classwise) + self.eval_results['mask'] = seg_stats + sys.stdout.flush() + + def log(self): + pass + + def get_results(self): + return self.eval_results + + +class VOCMetric(Metric): + def __init__(self, + label_list, + class_num=20, + overlap_thresh=0.5, + map_type='11point', + is_bbox_normalized=False, + evaluate_difficult=False, + classwise=False): + assert os.path.isfile(label_list), \ + "label_list {} not a file".format(label_list) + self.clsid2catid, self.catid2name = get_categories('VOC', label_list) + + self.overlap_thresh = overlap_thresh + self.map_type = map_type + self.evaluate_difficult = evaluate_difficult + self.detection_map = DetectionMAP( + class_num=class_num, + overlap_thresh=overlap_thresh, + map_type=map_type, + is_bbox_normalized=is_bbox_normalized, + evaluate_difficult=evaluate_difficult, + catid2name=self.catid2name, + classwise=classwise) + + self.reset() + + def reset(self): + self.detection_map.reset() + + def update(self, inputs, outputs): + bboxes = outputs['bbox'][:, 2:].numpy() + scores = outputs['bbox'][:, 1].numpy() + labels = outputs['bbox'][:, 0].numpy() + bbox_lengths = outputs['bbox_num'].numpy() + + if bboxes.shape == (1, 1) or bboxes is None: + return + gt_boxes = inputs['gt_bbox'].numpy() + gt_labels = inputs['gt_class'].numpy() + difficults = inputs['difficult'].numpy() if not self.evaluate_difficult \ + else None + + scale_factor = inputs['scale_factor'].numpy( + ) if 'scale_factor' in inputs else np.ones( + (gt_boxes.shape[0], 2)).astype('float32') + + bbox_idx = 0 + for i in range(gt_boxes.shape[0]): + gt_box = gt_boxes[i] + h, w = scale_factor[i] + gt_box = gt_box / np.array([w, h, w, h]) + gt_label = gt_labels[i] + difficult = None if difficults is None \ + else difficults[i] + bbox_num = bbox_lengths[i] + bbox = bboxes[bbox_idx:bbox_idx + bbox_num] + score = scores[bbox_idx:bbox_idx + bbox_num] + label = labels[bbox_idx:bbox_idx + bbox_num] + gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, + difficult) + self.detection_map.update(bbox, score, label, gt_box, gt_label, + difficult) + bbox_idx += bbox_num + + def accumulate(self): + logger.info("Accumulating evaluatation results...") + self.detection_map.accumulate() + + def log(self): + map_stat = 100. 
* self.detection_map.get_map() + logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, + self.map_type, map_stat)) + + def get_results(self): + return {'bbox': [self.detection_map.get_map()]} + + +class WiderFaceMetric(Metric): + def __init__(self, image_dir, anno_file, multi_scale=True): + self.image_dir = image_dir + self.anno_file = anno_file + self.multi_scale = multi_scale + self.clsid2catid, self.catid2name = get_categories('widerface') + + def update(self, model): + + face_eval_run( + model, + self.image_dir, + self.anno_file, + pred_dir='output/pred', + eval_mode='widerface', + multi_scale=self.multi_scale) diff --git a/ppdet/metrics/widerface_utils.py b/ppdet/metrics/widerface_utils.py new file mode 100644 index 0000000..2f64bf6 --- /dev/null +++ b/ppdet/metrics/widerface_utils.py @@ -0,0 +1,391 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import cv2 +import numpy as np +from collections import OrderedDict + +import paddle + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['face_eval_run', 'lmk2out'] + + +def face_eval_run(model, + image_dir, + gt_file, + pred_dir='output/pred', + eval_mode='widerface', + multi_scale=False): + # load ground truth files + with open(gt_file, 'r') as f: + gt_lines = f.readlines() + imid2path = [] + pos_gt = 0 + while pos_gt < len(gt_lines): + name_gt = gt_lines[pos_gt].strip('\n\t').split()[0] + imid2path.append(name_gt) + pos_gt += 1 + n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0]) + pos_gt += 1 + n_gt + logger.info('The ground truth file load {} images'.format(len(imid2path))) + + dets_dist = OrderedDict() + for iter_id, im_path in enumerate(imid2path): + image_path = os.path.join(image_dir, im_path) + if eval_mode == 'fddb': + image_path += '.jpg' + assert os.path.exists(image_path) + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + if multi_scale: + shrink, max_shrink = get_shrink(image.shape[0], image.shape[1]) + det0 = detect_face(model, image, shrink) + det1 = flip_test(model, image, shrink) + [det2, det3] = multi_scale_test(model, image, max_shrink) + det4 = multi_scale_test_pyramid(model, image, max_shrink) + det = np.row_stack((det0, det1, det2, det3, det4)) + dets = bbox_vote(det) + else: + dets = detect_face(model, image, 1) + if eval_mode == 'widerface': + save_widerface_bboxes(image_path, dets, pred_dir) + else: + dets_dist[im_path] = dets + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + if eval_mode == 'fddb': + save_fddb_bboxes(dets_dist, pred_dir) + logger.info("Finish evaluation.") + + +def detect_face(model, image, shrink): + image_shape = [image.shape[0], image.shape[1]] + if shrink != 1: + h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink) + image = cv2.resize(image, (w, h)) + image_shape = [h, w] + + img 
= face_img_process(image) + image_shape = np.asarray([image_shape]) + scale_factor = np.asarray([[shrink, shrink]]) + data = { + "image": paddle.to_tensor( + img, dtype='float32'), + "im_shape": paddle.to_tensor( + image_shape, dtype='float32'), + "scale_factor": paddle.to_tensor( + scale_factor, dtype='float32') + } + model.eval() + detection = model(data) + detection = detection['bbox'].numpy() + # layout: xmin, ymin, xmax. ymax, score + if np.prod(detection.shape) == 1: + logger.info("No face detected") + return np.array([[0, 0, 0, 0, 0]]) + det_conf = detection[:, 1] + det_xmin = detection[:, 2] + det_ymin = detection[:, 3] + det_xmax = detection[:, 4] + det_ymax = detection[:, 5] + + det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) + return det + + +def flip_test(model, image, shrink): + img = cv2.flip(image, 1) + det_f = detect_face(model, img, shrink) + det_t = np.zeros(det_f.shape) + img_width = image.shape[1] + det_t[:, 0] = img_width - det_f[:, 2] + det_t[:, 1] = det_f[:, 1] + det_t[:, 2] = img_width - det_f[:, 0] + det_t[:, 3] = det_f[:, 3] + det_t[:, 4] = det_f[:, 4] + return det_t + + +def multi_scale_test(model, image, max_shrink): + # Shrink detecting is only used to detect big faces + st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink + det_s = detect_face(model, image, st) + index = np.where( + np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) + > 30)[0] + det_s = det_s[index, :] + # Enlarge one times + bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2 + det_b = detect_face(model, image, bt) + + # Enlarge small image x times for small faces + if max_shrink > 2: + bt *= 2 + while bt < max_shrink: + det_b = np.row_stack((det_b, detect_face(model, image, bt))) + bt *= 2 + det_b = np.row_stack((det_b, detect_face(model, image, max_shrink))) + + # Enlarged images are only used to detect small faces. + if bt > 1: + index = np.where( + np.minimum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) < 100)[0] + det_b = det_b[index, :] + # Shrinked images are only used to detect big faces. + else: + index = np.where( + np.maximum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) > 30)[0] + det_b = det_b[index, :] + return det_s, det_b + + +def multi_scale_test_pyramid(model, image, max_shrink): + # Use image pyramids to detect faces + det_b = detect_face(model, image, 0.25) + index = np.where( + np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1) + > 30)[0] + det_b = det_b[index, :] + + st = [0.75, 1.25, 1.5, 1.75] + for i in range(len(st)): + if st[i] <= max_shrink: + det_temp = detect_face(model, image, st[i]) + # Enlarged images are only used to detect small faces. + if st[i] > 1: + index = np.where( + np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, + det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0] + det_temp = det_temp[index, :] + # Shrinked images are only used to detect big faces. + else: + index = np.where( + np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, + det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0] + det_temp = det_temp[index, :] + det_b = np.row_stack((det_b, det_temp)) + return det_b + + +def to_chw(image): + """ + Transpose image from HWC to CHW. + Args: + image (np.array): an image with HWC layout. 
+ """ + # HWC to CHW + if len(image.shape) == 3: + image = np.swapaxes(image, 1, 2) + image = np.swapaxes(image, 1, 0) + return image + + +def face_img_process(image, + mean=[104., 117., 123.], + std=[127.502231, 127.502231, 127.502231]): + img = np.array(image) + img = to_chw(img) + img = img.astype('float32') + img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') + img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32') + img = [img] + img = np.array(img) + return img + + +def get_shrink(height, width): + """ + Args: + height (int): image height. + width (int): image width. + """ + # avoid out of memory + max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5 + max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5 + + def get_round(x, loc): + str_x = str(x) + if '.' in str_x: + str_before, str_after = str_x.split('.') + len_after = len(str_after) + if len_after >= 3: + str_final = str_before + '.' + str_after[0:loc] + return float(str_final) + else: + return x + + max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3 + if max_shrink >= 1.5 and max_shrink < 2: + max_shrink = max_shrink - 0.1 + elif max_shrink >= 2 and max_shrink < 3: + max_shrink = max_shrink - 0.2 + elif max_shrink >= 3 and max_shrink < 4: + max_shrink = max_shrink - 0.3 + elif max_shrink >= 4 and max_shrink < 5: + max_shrink = max_shrink - 0.4 + elif max_shrink >= 5: + max_shrink = max_shrink - 0.5 + elif max_shrink <= 0.1: + max_shrink = 0.1 + + shrink = max_shrink if max_shrink < 1 else 1 + return shrink, max_shrink + + +def bbox_vote(det): + order = det[:, 4].ravel().argsort()[::-1] + det = det[order, :] + if det.shape[0] == 0: + dets = np.array([[10, 10, 20, 20, 0.002]]) + det = np.empty(shape=[0, 5]) + while det.shape[0] > 0: + # IOU + area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1) + xx1 = np.maximum(det[0, 0], det[:, 0]) + yy1 = np.maximum(det[0, 1], det[:, 1]) + xx2 = np.minimum(det[0, 2], det[:, 2]) + yy2 = np.minimum(det[0, 3], det[:, 3]) + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + o = inter / (area[0] + area[:] - inter) + + # nms + merge_index = np.where(o >= 0.3)[0] + det_accu = det[merge_index, :] + det = np.delete(det, merge_index, 0) + if merge_index.shape[0] <= 1: + if det.shape[0] == 0: + try: + dets = np.row_stack((dets, det_accu)) + except: + dets = det_accu + continue + det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4)) + max_score = np.max(det_accu[:, 4]) + det_accu_sum = np.zeros((1, 5)) + det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], + axis=0) / np.sum(det_accu[:, -1:]) + det_accu_sum[:, 4] = max_score + try: + dets = np.row_stack((dets, det_accu_sum)) + except: + dets = det_accu_sum + dets = dets[0:750, :] + keep_index = np.where(dets[:, 4] >= 0.01)[0] + dets = dets[keep_index, :] + return dets + + +def save_widerface_bboxes(image_path, bboxes_scores, output_dir): + image_name = image_path.split('/')[-1] + image_class = image_path.split('/')[-2] + odir = os.path.join(output_dir, image_class) + if not os.path.exists(odir): + os.makedirs(odir) + + ofname = os.path.join(odir, '%s.txt' % (image_name[:-4])) + f = open(ofname, 'w') + f.write('{:s}\n'.format(image_class + '/' + image_name)) + f.write('{:d}\n'.format(bboxes_scores.shape[0])) + for box_score in bboxes_scores: + xmin, ymin, xmax, ymax, score = box_score + f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, ( + xmax - xmin + 1), (ymax - ymin + 1), score)) + f.close() + logger.info("The 
predicted result is saved as {}".format(ofname)) + + +def save_fddb_bboxes(bboxes_scores, + output_dir, + output_fname='pred_fddb_res.txt'): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + predict_file = os.path.join(output_dir, output_fname) + f = open(predict_file, 'w') + for image_path, dets in bboxes_scores.iteritems(): + f.write('{:s}\n'.format(image_path)) + f.write('{:d}\n'.format(dets.shape[0])) + for box_score in dets: + xmin, ymin, xmax, ymax, score = box_score + width, height = xmax - xmin, ymax - ymin + f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n' + .format(xmin, ymin, width, height, score)) + logger.info("The predicted result is saved as {}".format(predict_file)) + return predict_file + + +def lmk2out(results, is_bbox_normalized=False): + """ + Args: + results: request a dict, should include: `landmark`, `im_id`, + if is_bbox_normalized=True, also need `im_shape`. + is_bbox_normalized: whether or not landmark is normalized. + """ + xywh_res = [] + for t in results: + bboxes = t['bbox'][0] + lengths = t['bbox'][1][0] + im_ids = np.array(t['im_id'][0]).flatten() + if bboxes.shape == (1, 1) or bboxes is None: + continue + face_index = t['face_index'][0] + prior_box = t['prior_boxes'][0] + predict_lmk = t['landmark'][0] + prior = np.reshape(prior_box, (-1, 4)) + predictlmk = np.reshape(predict_lmk, (-1, 10)) + + k = 0 + for a in range(len(lengths)): + num = lengths[a] + im_id = int(im_ids[a]) + for i in range(num): + score = bboxes[k][1] + theindex = face_index[i][0] + me_prior = prior[theindex, :] + lmk_pred = predictlmk[theindex, :] + prior_w = me_prior[2] - me_prior[0] + prior_h = me_prior[3] - me_prior[1] + prior_w_center = (me_prior[2] + me_prior[0]) / 2 + prior_h_center = (me_prior[3] + me_prior[1]) / 2 + lmk_decode = np.zeros((10)) + for j in [0, 2, 4, 6, 8]: + lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center + for j in [1, 3, 5, 7, 9]: + lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center + im_shape = t['im_shape'][0][a].tolist() + image_h, image_w = int(im_shape[0]), int(im_shape[1]) + if is_bbox_normalized: + lmk_decode = lmk_decode * np.array([ + image_w, image_h, image_w, image_h, image_w, image_h, + image_w, image_h, image_w, image_h + ]) + lmk_res = { + 'image_id': im_id, + 'landmark': lmk_decode, + 'score': score, + } + xywh_res.append(lmk_res) + k += 1 + return xywh_res diff --git a/ppdet/model_zoo/.gitignore b/ppdet/model_zoo/.gitignore new file mode 100644 index 0000000..f296851 --- /dev/null +++ b/ppdet/model_zoo/.gitignore @@ -0,0 +1 @@ +MODEL_ZOO diff --git a/ppdet/model_zoo/__init__.py b/ppdet/model_zoo/__init__.py new file mode 100644 index 0000000..6db6eb6 --- /dev/null +++ b/ppdet/model_zoo/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import model_zoo +from .model_zoo import * + +__all__ = model_zoo.__all__ diff --git a/ppdet/model_zoo/__pycache__/__init__.cpython-38.pyc b/ppdet/model_zoo/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..b683d90 Binary files /dev/null and b/ppdet/model_zoo/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/model_zoo/__pycache__/__init__.cpython-39.pyc b/ppdet/model_zoo/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..dd44d53 Binary files /dev/null and b/ppdet/model_zoo/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/model_zoo/__pycache__/model_zoo.cpython-38.pyc b/ppdet/model_zoo/__pycache__/model_zoo.cpython-38.pyc new file mode 100644 index 0000000..7a381d0 Binary files /dev/null and b/ppdet/model_zoo/__pycache__/model_zoo.cpython-38.pyc differ diff --git a/ppdet/model_zoo/__pycache__/model_zoo.cpython-39.pyc b/ppdet/model_zoo/__pycache__/model_zoo.cpython-39.pyc new file mode 100644 index 0000000..e7a6078 Binary files /dev/null and b/ppdet/model_zoo/__pycache__/model_zoo.cpython-39.pyc differ diff --git a/ppdet/model_zoo/model_zoo.py b/ppdet/model_zoo/model_zoo.py new file mode 100644 index 0000000..17af46f --- /dev/null +++ b/ppdet/model_zoo/model_zoo.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
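[editor's note] A short, hedged usage sketch for the model-zoo helpers defined in this file; the model name below is only an example and must match an entry in the packaged MODEL_ZOO list.

# Illustrative only (not committed code): list matching entries, then build a
# pretrained model; config and weights are fetched through ppdet.utils.download.
from ppdet.model_zoo import list_model, get_model

list_model(filters=['yolov3', 'coco'])  # logs every model whose name contains both filters
model = get_model('yolov3/yolov3_darknet53_270e_coco', pretrained=True)  # example name
model.eval()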
+ +import os +import os.path as osp +import glob +import pkg_resources + +try: + from collections.abc import Sequence +except: + from collections import Sequence + +from ppdet.core.workspace import load_config, create +from ppdet.utils.checkpoint import load_weight +from ppdet.utils.download import get_config_path + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'list_model', 'get_config_file', 'get_weights_url', 'get_model', + 'MODEL_ZOO_FILENAME' +] + +MODEL_ZOO_FILENAME = 'MODEL_ZOO' + + +def list_model(filters=[]): + model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo', + MODEL_ZOO_FILENAME) + with open(model_zoo_file) as f: + model_names = f.read().splitlines() + + # filter model_name + def filt(name): + for f in filters: + if name.find(f) < 0: + return False + return True + + if isinstance(filters, str) or not isinstance(filters, Sequence): + filters = [filters] + model_names = [name for name in model_names if filt(name)] + if len(model_names) == 0 and len(filters) > 0: + raise ValueError("no model found, please check filters seeting, " + "filters can be set as following kinds:\n" + "\tDataset: coco, voc ...\n" + "\tArchitecture: yolo, rcnn, ssd ...\n" + "\tBackbone: resnet, vgg, darknet ...\n") + + model_str = "Available Models:\n" + for model_name in model_names: + model_str += "\t{}\n".format(model_name) + logger.info(model_str) + + +# models and configs save on bcebos under dygraph directory +def get_config_file(model_name): + return get_config_path("ppdet://configs/{}.yml".format(model_name)) + + +def get_weights_url(model_name): + return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1]) + + +def get_model(model_name, pretrained=True): + cfg_file = get_config_file(model_name) + cfg = load_config(cfg_file) + model = create(cfg.architecture) + + if pretrained: + load_weight(model, get_weights_url(model_name)) + + return model diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py new file mode 100644 index 0000000..01968ba --- /dev/null +++ b/ppdet/modeling/__init__.py @@ -0,0 +1,25 @@ +# OP docs may contains math formula which may cause +# DeprecationWarning in string parsing +import warnings +warnings.filterwarnings( + action='ignore', category=DeprecationWarning, module='ops') + +from . import ops +from . import backbones +from . import necks +from . import proposal_generator +from . import heads +from . import losses +from . import architectures +from . import post_process +from . 
import layers + +from .ops import * +from .backbones import * +from .necks import * +from .proposal_generator import * +from .heads import * +from .losses import * +from .architectures import * +from .post_process import * +from .layers import * diff --git a/ppdet/modeling/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..07f2778 Binary files /dev/null and b/ppdet/modeling/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..fddd92c Binary files /dev/null and b/ppdet/modeling/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/__pycache__/bbox_utils.cpython-38.pyc b/ppdet/modeling/__pycache__/bbox_utils.cpython-38.pyc new file mode 100644 index 0000000..a10e94a Binary files /dev/null and b/ppdet/modeling/__pycache__/bbox_utils.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/bbox_utils.cpython-39.pyc b/ppdet/modeling/__pycache__/bbox_utils.cpython-39.pyc new file mode 100644 index 0000000..cb618d8 Binary files /dev/null and b/ppdet/modeling/__pycache__/bbox_utils.cpython-39.pyc differ diff --git a/ppdet/modeling/__pycache__/layers.cpython-38.pyc b/ppdet/modeling/__pycache__/layers.cpython-38.pyc new file mode 100644 index 0000000..49018c3 Binary files /dev/null and b/ppdet/modeling/__pycache__/layers.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/layers.cpython-39.pyc b/ppdet/modeling/__pycache__/layers.cpython-39.pyc new file mode 100644 index 0000000..ac70caf Binary files /dev/null and b/ppdet/modeling/__pycache__/layers.cpython-39.pyc differ diff --git a/ppdet/modeling/__pycache__/ops.cpython-38.pyc b/ppdet/modeling/__pycache__/ops.cpython-38.pyc new file mode 100644 index 0000000..0838c1c Binary files /dev/null and b/ppdet/modeling/__pycache__/ops.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/ops.cpython-39.pyc b/ppdet/modeling/__pycache__/ops.cpython-39.pyc new file mode 100644 index 0000000..74d63b1 Binary files /dev/null and b/ppdet/modeling/__pycache__/ops.cpython-39.pyc differ diff --git a/ppdet/modeling/__pycache__/post_process.cpython-38.pyc b/ppdet/modeling/__pycache__/post_process.cpython-38.pyc new file mode 100644 index 0000000..82793e1 Binary files /dev/null and b/ppdet/modeling/__pycache__/post_process.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/post_process.cpython-39.pyc b/ppdet/modeling/__pycache__/post_process.cpython-39.pyc new file mode 100644 index 0000000..d15194b Binary files /dev/null and b/ppdet/modeling/__pycache__/post_process.cpython-39.pyc differ diff --git a/ppdet/modeling/__pycache__/shape_spec.cpython-38.pyc b/ppdet/modeling/__pycache__/shape_spec.cpython-38.pyc new file mode 100644 index 0000000..b22919f Binary files /dev/null and b/ppdet/modeling/__pycache__/shape_spec.cpython-38.pyc differ diff --git a/ppdet/modeling/__pycache__/shape_spec.cpython-39.pyc b/ppdet/modeling/__pycache__/shape_spec.cpython-39.pyc new file mode 100644 index 0000000..08e6f9d Binary files /dev/null and b/ppdet/modeling/__pycache__/shape_spec.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py new file mode 100644 index 0000000..ae88160 --- /dev/null +++ b/ppdet/modeling/architectures/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +from . import meta_arch +from . import faster_rcnn +from . import mask_rcnn +from . import yolo +from . import cascade_rcnn +from . import ssd +from . import fcos +from . import solov2 +from . import ttfnet +from . import s2anet + +from .meta_arch import * +from .faster_rcnn import * +from .mask_rcnn import * +from .yolo import * +from .cascade_rcnn import * +from .ssd import * +from .fcos import * +from .solov2 import * +from .ttfnet import * +from .s2anet import * diff --git a/ppdet/modeling/architectures/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..f913481 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..0655ab3 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-38.pyc new file mode 100644 index 0000000..98b640a Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-39.pyc new file mode 100644 index 0000000..388d787 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/cascade_rcnn.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-38.pyc new file mode 100644 index 0000000..2be034b Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-39.pyc new file mode 100644 index 0000000..97a65e6 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/faster_rcnn.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/fcos.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/fcos.cpython-38.pyc new file mode 100644 index 0000000..a68c908 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/fcos.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/fcos.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/fcos.cpython-39.pyc new file mode 100644 index 0000000..1df02de Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/fcos.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-38.pyc new file mode 100644 index 0000000..5a62a36 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-39.pyc new file mode 100644 index 0000000..dc1d8b0 Binary files /dev/null and 
b/ppdet/modeling/architectures/__pycache__/mask_rcnn.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-38.pyc new file mode 100644 index 0000000..627a687 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-39.pyc new file mode 100644 index 0000000..ac1228d Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/meta_arch.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/s2anet.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/s2anet.cpython-38.pyc new file mode 100644 index 0000000..1c6912f Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/s2anet.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/s2anet.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/s2anet.cpython-39.pyc new file mode 100644 index 0000000..dad12d1 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/s2anet.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/solov2.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/solov2.cpython-38.pyc new file mode 100644 index 0000000..d82dc2d Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/solov2.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/solov2.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/solov2.cpython-39.pyc new file mode 100644 index 0000000..d416ba8 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/solov2.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/ssd.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/ssd.cpython-38.pyc new file mode 100644 index 0000000..92b9210 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/ssd.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/ssd.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/ssd.cpython-39.pyc new file mode 100644 index 0000000..a129627 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/ssd.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-38.pyc new file mode 100644 index 0000000..b6c3a5e Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-39.pyc new file mode 100644 index 0000000..a2b541a Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/ttfnet.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/yolo.cpython-38.pyc b/ppdet/modeling/architectures/__pycache__/yolo.cpython-38.pyc new file mode 100644 index 0000000..70b4613 Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/yolo.cpython-38.pyc differ diff --git a/ppdet/modeling/architectures/__pycache__/yolo.cpython-39.pyc b/ppdet/modeling/architectures/__pycache__/yolo.cpython-39.pyc new file mode 100644 index 0000000..8f2b09f Binary files /dev/null and b/ppdet/modeling/architectures/__pycache__/yolo.cpython-39.pyc differ diff --git a/ppdet/modeling/architectures/cascade_rcnn.py 
b/ppdet/modeling/architectures/cascade_rcnn.py new file mode 100644 index 0000000..ac29b77 --- /dev/null +++ b/ppdet/modeling/architectures/cascade_rcnn.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['CascadeRCNN'] + + +@register +class CascadeRCNN(BaseArch): + """ + Cascade R-CNN network, see https://arxiv.org/abs/1712.00726 + + Args: + backbone (object): backbone instance + rpn_head (object): `RPNHead` instance + bbox_head (object): `BBoxHead` instance + bbox_post_process (object): `BBoxPostProcess` instance + neck (object): 'FPN' instance + mask_head (object): `MaskHead` instance + mask_post_process (object): `MaskPostProcess` instance + """ + __category__ = 'architecture' + __inject__ = [ + 'bbox_post_process', + 'mask_post_process', + ] + + def __init__(self, + backbone, + rpn_head, + bbox_head, + bbox_post_process, + neck=None, + mask_head=None, + mask_post_process=None): + super(CascadeRCNN, self).__init__() + self.backbone = backbone + self.rpn_head = rpn_head + self.bbox_head = bbox_head + self.bbox_post_process = bbox_post_process + self.neck = neck + self.mask_head = mask_head + self.mask_post_process = mask_post_process + self.with_mask = mask_head is not None + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + kwargs = {'input_shape': backbone.out_shape} + neck = cfg['neck'] and create(cfg['neck'], **kwargs) + + out_shape = neck and neck.out_shape or backbone.out_shape + kwargs = {'input_shape': out_shape} + rpn_head = create(cfg['rpn_head'], **kwargs) + bbox_head = create(cfg['bbox_head'], **kwargs) + + out_shape = neck and out_shape or bbox_head.get_head().out_shape + kwargs = {'input_shape': out_shape} + mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs) + return { + 'backbone': backbone, + 'neck': neck, + "rpn_head": rpn_head, + "bbox_head": bbox_head, + "mask_head": mask_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + if self.neck is not None: + body_feats = self.neck(body_feats) + + if self.training: + rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) + bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, + self.inputs) + rois, rois_num = self.bbox_head.get_assigned_rois() + bbox_targets = self.bbox_head.get_assigned_targets() + if self.with_mask: + mask_loss = self.mask_head(body_feats, rois, rois_num, + self.inputs, bbox_targets, bbox_feat) + return rpn_loss, bbox_loss, mask_loss + else: + return rpn_loss, bbox_loss, {} + else: + rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) + preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) + refined_rois = self.bbox_head.get_refined_rois() + + im_shape = 
self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + + bbox, bbox_num = self.bbox_post_process( + preds, (refined_rois, rois_num), im_shape, scale_factor) + # rescale the prediction back to origin image + bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, + im_shape, scale_factor) + if not self.with_mask: + return bbox_pred, bbox_num, None + mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) + origin_shape = self.bbox_post_process.get_origin_shape() + mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred, + bbox_num, origin_shape) + return bbox_pred, bbox_num, mask_pred + + def get_loss(self, ): + rpn_loss, bbox_loss, mask_loss = self._forward() + loss = {} + loss.update(rpn_loss) + loss.update(bbox_loss) + if self.with_mask: + loss.update(mask_loss) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + bbox_pred, bbox_num, mask_pred = self._forward() + output = { + 'bbox': bbox_pred, + 'bbox_num': bbox_num, + } + if self.with_mask: + output.update({'mask': mask_pred}) + return output diff --git a/ppdet/modeling/architectures/faster_rcnn.py b/ppdet/modeling/architectures/faster_rcnn.py new file mode 100644 index 0000000..26a2672 --- /dev/null +++ b/ppdet/modeling/architectures/faster_rcnn.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
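[editor's note] The get_loss pattern above is shared by the R-CNN architectures in this patch: every head returns a dict of named losses, the dicts are merged, and paddle.add_n sums them into one scalar. A toy sketch of that reduction; the loss names and values are invented for the example.

# Toy illustration (not committed code) of the loss-dict reduction in get_loss.
import paddle

rpn_loss = {'loss_rpn_cls': paddle.to_tensor(0.3), 'loss_rpn_reg': paddle.to_tensor(0.1)}
bbox_loss = {'loss_bbox_cls': paddle.to_tensor(0.5), 'loss_bbox_reg': paddle.to_tensor(0.2)}

loss = {}
loss.update(rpn_loss)
loss.update(bbox_loss)
total_loss = paddle.add_n(list(loss.values()))  # 0.3 + 0.1 + 0.5 + 0.2 = 1.1
loss.update({'loss': total_loss})               # 'loss' is what the optimizer backpropagates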
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['FasterRCNN'] + + +@register +class FasterRCNN(BaseArch): + """ + Faster R-CNN network, see https://arxiv.org/abs/1506.01497 + + Args: + backbone (object): backbone instance + rpn_head (object): `RPNHead` instance + bbox_head (object): `BBoxHead` instance + bbox_post_process (object): `BBoxPostProcess` instance + neck (object): 'FPN' instance + """ + __category__ = 'architecture' + __inject__ = ['bbox_post_process'] + + def __init__(self, + backbone, + rpn_head, + bbox_head, + bbox_post_process, + neck=None): + super(FasterRCNN, self).__init__() + self.backbone = backbone + self.neck = neck + self.rpn_head = rpn_head + self.bbox_head = bbox_head + self.bbox_post_process = bbox_post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + kwargs = {'input_shape': backbone.out_shape} + neck = cfg['neck'] and create(cfg['neck'], **kwargs) + + out_shape = neck and neck.out_shape or backbone.out_shape + kwargs = {'input_shape': out_shape} + rpn_head = create(cfg['rpn_head'], **kwargs) + bbox_head = create(cfg['bbox_head'], **kwargs) + return { + 'backbone': backbone, + 'neck': neck, + "rpn_head": rpn_head, + "bbox_head": bbox_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + if self.neck is not None: + body_feats = self.neck(body_feats) + if self.training: + rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) + bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num, + self.inputs) + return rpn_loss, bbox_loss + else: + rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) + preds, _ = self.bbox_head(body_feats, rois, rois_num, None) + + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num), + im_shape, scale_factor) + + # rescale the prediction back to origin image + bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, + im_shape, scale_factor) + return bbox_pred, bbox_num + + def get_loss(self, ): + rpn_loss, bbox_loss = self._forward() + loss = {} + loss.update(rpn_loss) + loss.update(bbox_loss) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + bbox_pred, bbox_num = self._forward() + output = {'bbox': bbox_pred, 'bbox_num': bbox_num} + return output diff --git a/ppdet/modeling/architectures/fcos.py b/ppdet/modeling/architectures/fcos.py new file mode 100644 index 0000000..6f71c89 --- /dev/null +++ b/ppdet/modeling/architectures/fcos.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
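[editor's note] A hedged sketch of driving one of these config-built detectors in eval mode, the way the training/eval engine does; the config path and input sizes are assumptions for illustration only.

# Illustrative only (not committed code): build FasterRCNN from a YAML config
# and run it on a dummy batch; weights here are random, so outputs are junk.
import paddle
from ppdet.core.workspace import load_config, create

cfg = load_config('configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml')  # assumed path
model = create(cfg.architecture)
model.eval()

inputs = {
    'image': paddle.rand([1, 3, 640, 640]),
    'im_shape': paddle.to_tensor([[640., 640.]]),
    'scale_factor': paddle.to_tensor([[1., 1.]]),
}
out = model(inputs)
# out['bbox']: [N, 6] rows of (class id, score, x1, y1, x2, y2); out['bbox_num']: boxes per image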
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['FCOS'] + + +@register +class FCOS(BaseArch): + __category__ = 'architecture' + __inject__ = ['fcos_post_process'] + + def __init__(self, + backbone, + neck, + fcos_head='FCOSHead', + fcos_post_process='FCOSPostProcess'): + super(FCOS, self).__init__() + self.backbone = backbone + self.neck = neck + self.fcos_head = fcos_head + self.fcos_post_process = fcos_post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + kwargs = {'input_shape': neck.out_shape} + fcos_head = create(cfg['fcos_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "fcos_head": fcos_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + fpn_feats = self.neck(body_feats) + fcos_head_outs = self.fcos_head(fpn_feats, self.training) + if not self.training: + scale_factor = self.inputs['scale_factor'] + bboxes = self.fcos_post_process(fcos_head_outs, scale_factor) + return bboxes + else: + return fcos_head_outs + + def get_loss(self, ): + loss = {} + tag_labels, tag_bboxes, tag_centerness = [], [], [] + for i in range(len(self.fcos_head.fpn_stride)): + # reg_target, labels, scores, centerness + k_lbl = 'labels{}'.format(i) + if k_lbl in self.inputs: + tag_labels.append(self.inputs[k_lbl]) + k_box = 'reg_target{}'.format(i) + if k_box in self.inputs: + tag_bboxes.append(self.inputs[k_box]) + k_ctn = 'centerness{}'.format(i) + if k_ctn in self.inputs: + tag_centerness.append(self.inputs[k_ctn]) + + fcos_head_outs = self._forward() + loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels, + tag_bboxes, tag_centerness) + loss.update(loss_fcos) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + bboxes, bbox_num = self._forward() + output = {'bbox': bboxes, 'bbox_num': bbox_num} + return output diff --git a/ppdet/modeling/architectures/mask_rcnn.py b/ppdet/modeling/architectures/mask_rcnn.py new file mode 100644 index 0000000..071a326 --- /dev/null +++ b/ppdet/modeling/architectures/mask_rcnn.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
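
FCOS.get_loss above gathers its ground-truth targets from per-FPN-level keys in self.inputs ('labels0', 'reg_target0', 'centerness0', and so on, one index per stride). A small sketch of that key convention; the key names mirror the code above, while the level count and placeholder values are made up (real inputs are paddle tensors produced by the reader):

```python
# Hypothetical inputs dict with 5 FPN levels; strings stand in for tensors.
num_levels = 5
inputs = {}
for i in range(num_levels):
    inputs['labels{}'.format(i)] = 'labels_lvl{}'.format(i)
    inputs['reg_target{}'.format(i)] = 'boxes_lvl{}'.format(i)
    inputs['centerness{}'.format(i)] = 'ctn_lvl{}'.format(i)

tag_labels, tag_bboxes, tag_centerness = [], [], []
for i in range(num_levels):
    for key, bucket in (('labels{}'.format(i), tag_labels),
                        ('reg_target{}'.format(i), tag_bboxes),
                        ('centerness{}'.format(i), tag_centerness)):
        if key in inputs:
            bucket.append(inputs[key])

assert len(tag_labels) == len(tag_bboxes) == len(tag_centerness) == num_levels
```
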
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['MaskRCNN'] + + +@register +class MaskRCNN(BaseArch): + """ + Mask R-CNN network, see https://arxiv.org/abs/1703.06870 + + Args: + backbone (object): backbone instance + rpn_head (object): `RPNHead` instance + bbox_head (object): `BBoxHead` instance + mask_head (object): `MaskHead` instance + bbox_post_process (object): `BBoxPostProcess` instance + mask_post_process (object): `MaskPostProcess` instance + neck (object): 'FPN' instance + """ + + __category__ = 'architecture' + __inject__ = [ + 'bbox_post_process', + 'mask_post_process', + ] + + def __init__(self, + backbone, + rpn_head, + bbox_head, + mask_head, + bbox_post_process, + mask_post_process, + neck=None): + super(MaskRCNN, self).__init__() + self.backbone = backbone + self.neck = neck + self.rpn_head = rpn_head + self.bbox_head = bbox_head + self.mask_head = mask_head + + self.bbox_post_process = bbox_post_process + self.mask_post_process = mask_post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + kwargs = {'input_shape': backbone.out_shape} + neck = cfg['neck'] and create(cfg['neck'], **kwargs) + + out_shape = neck and neck.out_shape or backbone.out_shape + kwargs = {'input_shape': out_shape} + rpn_head = create(cfg['rpn_head'], **kwargs) + bbox_head = create(cfg['bbox_head'], **kwargs) + + out_shape = neck and out_shape or bbox_head.get_head().out_shape + kwargs = {'input_shape': out_shape} + mask_head = create(cfg['mask_head'], **kwargs) + return { + 'backbone': backbone, + 'neck': neck, + "rpn_head": rpn_head, + "bbox_head": bbox_head, + "mask_head": mask_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + if self.neck is not None: + body_feats = self.neck(body_feats) + + if self.training: + rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) + bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, + self.inputs) + rois, rois_num = self.bbox_head.get_assigned_rois() + bbox_targets = self.bbox_head.get_assigned_targets() + # Mask Head needs bbox_feat in Mask RCNN + mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, + bbox_targets, bbox_feat) + return rpn_loss, bbox_loss, mask_loss + else: + rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) + preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None) + + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + + bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num), + im_shape, scale_factor) + mask_out = self.mask_head( + body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) + + # rescale the prediction back to origin image + bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num, + im_shape, scale_factor) + origin_shape = self.bbox_post_process.get_origin_shape() + mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred, + bbox_num, origin_shape) + return bbox_pred, bbox_num, mask_pred + + def get_loss(self, ): + bbox_loss, mask_loss, rpn_loss = self._forward() + loss = {} + loss.update(rpn_loss) + loss.update(bbox_loss) + loss.update(mask_loss) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + bbox_pred, bbox_num, mask_pred = self._forward() + output = {'bbox': bbox_pred, 
'bbox_num': bbox_num, 'mask': mask_pred} + return output diff --git a/ppdet/modeling/architectures/meta_arch.py b/ppdet/modeling/architectures/meta_arch.py new file mode 100644 index 0000000..fb2f5bd --- /dev/null +++ b/ppdet/modeling/architectures/meta_arch.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +from ppdet.core.workspace import register + +__all__ = ['BaseArch'] + + +@register +class BaseArch(nn.Layer): + def __init__(self, data_format='NCHW'): + super(BaseArch, self).__init__() + self.data_format = data_format + + def forward(self, inputs): + if self.data_format == 'NHWC': + image = inputs['image'] + inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) + self.inputs = inputs + self.model_arch() + + if self.training: + out = self.get_loss() + else: + out = self.get_pred() + return out + + def build_inputs(self, data, input_def): + inputs = {} + for i, k in enumerate(input_def): + inputs[k] = data[i] + return inputs + + def model_arch(self, ): + pass + + def get_loss(self, ): + raise NotImplementedError("Should implement get_loss method!") + + def get_pred(self, ): + raise NotImplementedError("Should implement get_pred method!") diff --git a/ppdet/modeling/architectures/s2anet.py b/ppdet/modeling/architectures/s2anet.py new file mode 100644 index 0000000..72e9e82 --- /dev/null +++ b/ppdet/modeling/architectures/s2anet.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
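
meta_arch.py above defines the contract that every architecture in this directory implements: forward() stores the input dict on self.inputs and dispatches to get_loss() in training mode or get_pred() in eval mode. A toy, self-contained subclass (not a real detector, and not part of the repository) that exercises only that dispatch:

```python
import paddle
import paddle.nn as nn

class TinyArch(nn.Layer):
    """Toy stand-in: same train/eval dispatch as BaseArch, but no real heads."""

    def forward(self, inputs):
        self.inputs = inputs
        return self.get_loss() if self.training else self.get_pred()

    def get_loss(self):
        loss = {'loss_demo': self.inputs['image'].mean()}
        loss['loss'] = paddle.add_n(list(loss.values()))
        return loss

    def get_pred(self):
        return {'bbox': paddle.zeros([1, 6]), 'bbox_num': paddle.to_tensor([1])}

arch = TinyArch()
data = {'image': paddle.rand([2, 3, 32, 32])}
arch.train()
print(sorted(arch(data).keys()))   # ['loss', 'loss_demo']
arch.eval()
print(sorted(arch(data).keys()))   # ['bbox', 'bbox_num']
```
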
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch +import numpy as np + +__all__ = ['S2ANet'] + + +@register +class S2ANet(BaseArch): + __category__ = 'architecture' + __inject__ = [ + 's2anet_head', + 's2anet_bbox_post_process', + ] + + def __init__(self, backbone, neck, s2anet_head, s2anet_bbox_post_process): + """ + S2ANet, see https://arxiv.org/pdf/2008.09397.pdf + + Args: + backbone (object): backbone instance + neck (object): `FPN` instance + s2anet_head (object): `S2ANetHead` instance + s2anet_bbox_post_process (object): `S2ANetBBoxPostProcess` instance + """ + super(S2ANet, self).__init__() + self.backbone = backbone + self.neck = neck + self.s2anet_head = s2anet_head + self.s2anet_bbox_post_process = s2anet_bbox_post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + kwargs = {'input_shape': backbone.out_shape} + neck = cfg['neck'] and create(cfg['neck'], **kwargs) + + out_shape = neck and neck.out_shape or backbone.out_shape + kwargs = {'input_shape': out_shape} + s2anet_head = create(cfg['s2anet_head'], **kwargs) + s2anet_bbox_post_process = create(cfg['s2anet_bbox_post_process'], + **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "s2anet_head": s2anet_head, + "s2anet_bbox_post_process": s2anet_bbox_post_process, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + if self.neck is not None: + body_feats = self.neck(body_feats) + self.s2anet_head(body_feats) + if self.training: + loss = self.s2anet_head.get_loss(self.inputs) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + im_shape = self.inputs['im_shape'] + scale_factor = self.inputs['scale_factor'] + nms_pre = self.s2anet_bbox_post_process.nms_pre + pred_scores, pred_bboxes = self.s2anet_head.get_prediction(nms_pre) + + # post_process + pred_cls_score_bbox, bbox_num, index = self.s2anet_bbox_post_process.get_prediction( + pred_scores, pred_bboxes, im_shape, scale_factor) + + # output + output = {'bbox': pred_cls_score_bbox, 'bbox_num': bbox_num} + return output + + def get_loss(self, ): + loss = self._forward() + return loss + + def get_pred(self): + output = self._forward() + return output diff --git a/ppdet/modeling/architectures/solov2.py b/ppdet/modeling/architectures/solov2.py new file mode 100644 index 0000000..4e5fc21 --- /dev/null +++ b/ppdet/modeling/architectures/solov2.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
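
The from_config() methods above (S2ANet included) all thread feature shapes the same way: the backbone exposes out_shape, which is passed as input_shape when creating the neck, and the neck's out_shape is passed on to the heads (S2ANet additionally hands it to its bbox post-process). A toy sketch of that plumbing, with plain lists standing in for ppdet's ShapeSpec objects and made-up channel counts:

```python
# Stand-in components; the real create() builds them from the YAML config.
class Backbone:
    out_shape = [256, 512, 1024]               # channels of the returned stages

class Neck:
    def __init__(self, input_shape):
        self.out_shape = [96 for _ in input_shape]   # FPN-style: fixed width per level

class Head:
    def __init__(self, input_shape):
        self.in_channels = list(input_shape)

backbone = Backbone()
neck = Neck(input_shape=backbone.out_shape)
head = Head(input_shape=neck.out_shape)
assert head.in_channels == [96, 96, 96]
```
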
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['SOLOv2'] + + +@register +class SOLOv2(BaseArch): + """ + SOLOv2 network, see https://arxiv.org/abs/2003.10152 + + Args: + backbone (object): an backbone instance + solov2_head (object): an `SOLOv2Head` instance + mask_head (object): an `SOLOv2MaskHead` instance + neck (object): neck of network, such as feature pyramid network instance + """ + + __category__ = 'architecture' + + def __init__(self, backbone, solov2_head, mask_head, neck=None): + super(SOLOv2, self).__init__() + self.backbone = backbone + self.neck = neck + self.solov2_head = solov2_head + self.mask_head = mask_head + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + kwargs = {'input_shape': neck.out_shape} + solov2_head = create(cfg['solov2_head'], **kwargs) + mask_head = create(cfg['mask_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + 'solov2_head': solov2_head, + 'mask_head': mask_head, + } + + def model_arch(self): + body_feats = self.backbone(self.inputs) + + body_feats = self.neck(body_feats) + + self.seg_pred = self.mask_head(body_feats) + + self.cate_pred_list, self.kernel_pred_list = self.solov2_head( + body_feats) + + def get_loss(self, ): + loss = {} + # get gt_ins_labels, gt_cate_labels, etc. + gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], [] + fg_num = self.inputs['fg_num'] + for i in range(len(self.solov2_head.seg_num_grids)): + ins_label = 'ins_label{}'.format(i) + if ins_label in self.inputs: + gt_ins_labels.append(self.inputs[ins_label]) + cate_label = 'cate_label{}'.format(i) + if cate_label in self.inputs: + gt_cate_labels.append(self.inputs[cate_label]) + grid_order = 'grid_order{}'.format(i) + if grid_order in self.inputs: + gt_grid_orders.append(self.inputs[grid_order]) + + loss_solov2 = self.solov2_head.get_loss( + self.cate_pred_list, self.kernel_pred_list, self.seg_pred, + gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num) + loss.update(loss_solov2) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction( + self.cate_pred_list, self.kernel_pred_list, self.seg_pred, + self.inputs['im_shape'], self.inputs['scale_factor']) + outs = { + "segm": seg_masks, + "bbox_num": bbox_num, + 'cate_label': cate_labels, + 'cate_score': cate_scores + } + return outs diff --git a/ppdet/modeling/architectures/ssd.py b/ppdet/modeling/architectures/ssd.py new file mode 100644 index 0000000..136e34f --- /dev/null +++ b/ppdet/modeling/architectures/ssd.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['SSD'] + + +@register +class SSD(BaseArch): + """ + Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 + + Args: + backbone (nn.Layer): backbone instance + ssd_head (nn.Layer): `SSDHead` instance + post_process (object): `BBoxPostProcess` instance + """ + + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, backbone, ssd_head, post_process): + super(SSD, self).__init__() + self.backbone = backbone + self.ssd_head = ssd_head + self.post_process = post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # head + kwargs = {'input_shape': backbone.out_shape} + ssd_head = create(cfg['ssd_head'], **kwargs) + + return { + 'backbone': backbone, + "ssd_head": ssd_head, + } + + def _forward(self): + # Backbone + body_feats = self.backbone(self.inputs) + + # SSD Head + if self.training: + return self.ssd_head(body_feats, self.inputs['image'], + self.inputs['gt_bbox'], + self.inputs['gt_class']) + else: + preds, anchors = self.ssd_head(body_feats, self.inputs['image']) + bbox, bbox_num = self.post_process(preds, anchors, + self.inputs['im_shape'], + self.inputs['scale_factor']) + return bbox, bbox_num + + def get_loss(self, ): + return {"loss": self._forward()} + + def get_pred(self): + bbox_pred, bbox_num = self._forward() + output = { + "bbox": bbox_pred, + "bbox_num": bbox_num, + } + return output diff --git a/ppdet/modeling/architectures/ttfnet.py b/ppdet/modeling/architectures/ttfnet.py new file mode 100644 index 0000000..c3eb61c --- /dev/null +++ b/ppdet/modeling/architectures/ttfnet.py @@ -0,0 +1,98 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
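
SSD.get_pred above (like the other architectures here) returns a flat 'bbox' tensor covering the whole batch plus a 'bbox_num' tensor holding one detection count per image. A sketch of how downstream code can split that pair back into per-image results; the detections are made up, and the assumed row layout ([class_id, score, x1, y1, x2, y2]) should be verified against the post-process actually configured:

```python
import numpy as np

# Made-up detections for a batch of two images.
bbox = np.array([[0, 0.90, 10, 10, 50, 60],
                 [2, 0.70, 20, 20, 40, 80],
                 [1, 0.80,  5,  5, 30, 30]], dtype=np.float32)
bbox_num = np.array([2, 1])          # image 0 has 2 boxes, image 1 has 1

per_image, start = [], 0
for n in bbox_num:
    per_image.append(bbox[start:start + n])
    start += n

assert [len(b) for b in per_image] == [2, 1]
```
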
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['TTFNet'] + + +@register +class TTFNet(BaseArch): + """ + TTFNet network, see https://arxiv.org/abs/1909.00700 + + Args: + backbone (object): backbone instance + neck (object): 'TTFFPN' instance + ttf_head (object): 'TTFHead' instance + post_process (object): 'BBoxPostProcess' instance + """ + + __category__ = 'architecture' + __inject__ = ['post_process'] + + def __init__(self, + backbone='DarkNet', + neck='TTFFPN', + ttf_head='TTFHead', + post_process='BBoxPostProcess'): + super(TTFNet, self).__init__() + self.backbone = backbone + self.neck = neck + self.ttf_head = ttf_head + self.post_process = post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + backbone = create(cfg['backbone']) + + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + kwargs = {'input_shape': neck.out_shape} + ttf_head = create(cfg['ttf_head'], **kwargs) + + return { + 'backbone': backbone, + 'neck': neck, + "ttf_head": ttf_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + body_feats = self.neck(body_feats) + hm, wh = self.ttf_head(body_feats) + if self.training: + return hm, wh + else: + bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'], + self.inputs['scale_factor']) + return bbox, bbox_num + + def get_loss(self, ): + loss = {} + heatmap = self.inputs['ttf_heatmap'] + box_target = self.inputs['ttf_box_target'] + reg_weight = self.inputs['ttf_reg_weight'] + hm, wh = self._forward() + head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target, + reg_weight) + loss.update(head_loss) + total_loss = paddle.add_n(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + def get_pred(self): + bbox_pred, bbox_num = self._forward() + output = { + "bbox": bbox_pred, + "bbox_num": bbox_num, + } + return output diff --git a/ppdet/modeling/architectures/yolo.py b/ppdet/modeling/architectures/yolo.py new file mode 100644 index 0000000..6c04444 --- /dev/null +++ b/ppdet/modeling/architectures/yolo.py @@ -0,0 +1,77 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ppdet.core.workspace import register, create +from .meta_arch import BaseArch + +__all__ = ['YOLOv3'] + + +@register +class YOLOv3(BaseArch): + __category__ = 'architecture' + __shared__ = ['data_format'] + __inject__ = ['post_process'] + + def __init__(self, + backbone='DarkNet', + neck='YOLOv3FPN', + yolo_head='YOLOv3Head', + post_process='BBoxPostProcess', + data_format='NCHW'): + """ + YOLOv3 network, see https://arxiv.org/abs/1804.02767 + + Args: + backbone (nn.Layer): backbone instance + neck (nn.Layer): neck instance + yolo_head (nn.Layer): anchor_head instance + bbox_post_process (object): `BBoxPostProcess` instance + data_format (str): data format, NCHW or NHWC + """ + super(YOLOv3, self).__init__(data_format=data_format) + self.backbone = backbone + self.neck = neck + self.yolo_head = yolo_head + self.post_process = post_process + + @classmethod + def from_config(cls, cfg, *args, **kwargs): + # backbone + backbone = create(cfg['backbone']) + + # fpn + kwargs = {'input_shape': backbone.out_shape} + neck = create(cfg['neck'], **kwargs) + + # head + kwargs = {'input_shape': neck.out_shape} + yolo_head = create(cfg['yolo_head'], **kwargs) + + 
return { + 'backbone': backbone, + 'neck': neck, + "yolo_head": yolo_head, + } + + def _forward(self): + body_feats = self.backbone(self.inputs) + body_feats = self.neck(body_feats) + + if self.training: + return self.yolo_head(body_feats, self.inputs) + else: + yolo_head_outs = self.yolo_head(body_feats) + bbox, bbox_num = self.post_process( + yolo_head_outs, self.yolo_head.mask_anchors, + self.inputs['im_shape'], self.inputs['scale_factor']) + return bbox, bbox_num + + def get_loss(self): + return self._forward() + + def get_pred(self): + bbox_pred, bbox_num = self._forward() + output = {'bbox': bbox_pred, 'bbox_num': bbox_num} + return output diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py new file mode 100644 index 0000000..4937c9b --- /dev/null +++ b/ppdet/modeling/backbones/__init__.py @@ -0,0 +1,33 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import vgg +from . import resnet +from . import darknet +from . import mobilenet_v1 +from . import mobilenet_v3 +from . import hrnet +from . import blazenet +from . import ghostnet +from . import senet + +from .vgg import * +from .resnet import * +from .darknet import * +from .mobilenet_v1 import * +from .mobilenet_v3 import * +from .hrnet import * +from .blazenet import * +from .ghostnet import * +from .senet import * diff --git a/ppdet/modeling/backbones/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..30fef8c Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..a823183 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/blazenet.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/blazenet.cpython-38.pyc new file mode 100644 index 0000000..ab6a538 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/blazenet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/blazenet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/blazenet.cpython-39.pyc new file mode 100644 index 0000000..319dea1 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/blazenet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/darknet.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/darknet.cpython-38.pyc new file mode 100644 index 0000000..be1bfd5 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/darknet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/darknet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/darknet.cpython-39.pyc new file mode 100644 index 0000000..c39ebb8 Binary files /dev/null and 
b/ppdet/modeling/backbones/__pycache__/darknet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-38.pyc new file mode 100644 index 0000000..513b098 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-39.pyc new file mode 100644 index 0000000..50bd846 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/ghostnet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/hrnet.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/hrnet.cpython-38.pyc new file mode 100644 index 0000000..419c52a Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/hrnet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/hrnet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/hrnet.cpython-39.pyc new file mode 100644 index 0000000..c3e782e Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/hrnet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-38.pyc new file mode 100644 index 0000000..2a67bd6 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-39.pyc new file mode 100644 index 0000000..fd95a6b Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/mobilenet_v1.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-38.pyc new file mode 100644 index 0000000..66d40a9 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-39.pyc new file mode 100644 index 0000000..8b0900d Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/mobilenet_v3.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-38.pyc new file mode 100644 index 0000000..d297ba7 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-39.pyc new file mode 100644 index 0000000..02f07dc Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/name_adapter.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/resnet.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/resnet.cpython-38.pyc new file mode 100644 index 0000000..0911204 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/resnet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/resnet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/resnet.cpython-39.pyc new file mode 100644 index 0000000..e5bca7d Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/resnet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/senet.cpython-38.pyc 
b/ppdet/modeling/backbones/__pycache__/senet.cpython-38.pyc new file mode 100644 index 0000000..11c6d6f Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/senet.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/senet.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/senet.cpython-39.pyc new file mode 100644 index 0000000..5594e94 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/senet.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/vgg.cpython-38.pyc b/ppdet/modeling/backbones/__pycache__/vgg.cpython-38.pyc new file mode 100644 index 0000000..d02d4a4 Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/vgg.cpython-38.pyc differ diff --git a/ppdet/modeling/backbones/__pycache__/vgg.cpython-39.pyc b/ppdet/modeling/backbones/__pycache__/vgg.cpython-39.pyc new file mode 100644 index 0000000..25f298e Binary files /dev/null and b/ppdet/modeling/backbones/__pycache__/vgg.cpython-39.pyc differ diff --git a/ppdet/modeling/backbones/blazenet.py b/ppdet/modeling/backbones/blazenet.py new file mode 100644 index 0000000..97134c2 --- /dev/null +++ b/ppdet/modeling/backbones/blazenet.py @@ -0,0 +1,321 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
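
The BlazeBlock defined in blazenet.py below is built from depthwise convolutions (num_groups equal to the channel count) followed by 1x1 pointwise convolutions, each wrapped in ConvBNLayer. A bare-paddle sketch of that depthwise + pointwise pair, without the batch-norm and activation plumbing of the real block (channel counts are arbitrary):

```python
import paddle
import paddle.nn as nn

# Depthwise 5x5 conv: groups equals the channel count, so each channel is
# filtered independently; the 1x1 pointwise conv then mixes channels.
dw = nn.Conv2D(24, 24, kernel_size=5, stride=1, padding=2, groups=24, bias_attr=False)
pw = nn.Conv2D(24, 48, kernel_size=1, bias_attr=False)

x = paddle.rand([1, 24, 32, 32])
y = pw(dw(x))
print(y.shape)  # [1, 48, 32, 32]
```
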
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec + +__all__ = ['BlazeNet'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + num_groups=1, + act='relu', + conv_lr=0.1, + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr( + learning_rate=conv_lr, + initializer=KaimingNormal(), + name=name + "_weights"), + bias_attr=False) + + param_attr = ParamAttr(name=name + "_bn_scale") + bias_attr = ParamAttr(name=name + "_bn_offset") + if norm_type == 'sync_bn': + self._batch_norm = nn.SyncBatchNorm( + out_channels, weight_attr=param_attr, bias_attr=bias_attr) + else: + self._batch_norm = nn.BatchNorm( + out_channels, + act=None, + param_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=False, + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + if self.act == "relu": + x = F.relu(x) + elif self.act == "relu6": + x = F.relu6(x) + return x + + +class BlazeBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels1, + out_channels2, + double_channels=None, + stride=1, + use_5x5kernel=True, + name=None): + super(BlazeBlock, self).__init__() + assert stride in [1, 2] + self.use_pool = not stride == 1 + self.use_double_block = double_channels is not None + self.conv_dw = [] + if use_5x5kernel: + self.conv_dw.append( + self.add_sublayer( + name + "1_dw", + ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels1, + kernel_size=5, + stride=stride, + padding=2, + num_groups=out_channels1, + name=name + "1_dw"))) + else: + self.conv_dw.append( + self.add_sublayer( + name + "1_dw_1", + ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels1, + kernel_size=3, + stride=1, + padding=1, + num_groups=out_channels1, + name=name + "1_dw_1"))) + self.conv_dw.append( + self.add_sublayer( + name + "1_dw_2", + ConvBNLayer( + in_channels=out_channels1, + out_channels=out_channels1, + kernel_size=3, + stride=stride, + padding=1, + num_groups=out_channels1, + name=name + "1_dw_2"))) + act = 'relu' if self.use_double_block else None + self.conv_pw = ConvBNLayer( + in_channels=out_channels1, + out_channels=out_channels2, + kernel_size=1, + stride=1, + padding=0, + act=act, + name=name + "1_sep") + if self.use_double_block: + self.conv_dw2 = [] + if use_5x5kernel: + self.conv_dw2.append( + self.add_sublayer( + name + "2_dw", + ConvBNLayer( + in_channels=out_channels2, + out_channels=out_channels2, + kernel_size=5, + stride=1, + padding=2, + num_groups=out_channels2, + name=name + "2_dw"))) + else: + self.conv_dw2.append( + self.add_sublayer( + name + "2_dw_1", + ConvBNLayer( + in_channels=out_channels2, + out_channels=out_channels2, + kernel_size=3, + stride=1, + padding=1, + num_groups=out_channels2, + name=name + "1_dw_1"))) + self.conv_dw2.append( + self.add_sublayer( + name + "2_dw_2", + ConvBNLayer( + 
in_channels=out_channels2, + out_channels=out_channels2, + kernel_size=3, + stride=1, + padding=1, + num_groups=out_channels2, + name=name + "2_dw_2"))) + self.conv_pw2 = ConvBNLayer( + in_channels=out_channels2, + out_channels=double_channels, + kernel_size=1, + stride=1, + padding=0, + name=name + "2_sep") + # shortcut + if self.use_pool: + shortcut_channel = double_channels or out_channels2 + self._shortcut = [] + self._shortcut.append( + self.add_sublayer( + name + '_shortcut_pool', + nn.MaxPool2D( + kernel_size=stride, stride=stride, ceil_mode=True))) + self._shortcut.append( + self.add_sublayer( + name + '_shortcut_conv', + ConvBNLayer( + in_channels=in_channels, + out_channels=shortcut_channel, + kernel_size=1, + stride=1, + padding=0, + name="shortcut" + name))) + + def forward(self, x): + y = x + for conv_dw_block in self.conv_dw: + y = conv_dw_block(y) + y = self.conv_pw(y) + if self.use_double_block: + for conv_dw2_block in self.conv_dw2: + y = conv_dw2_block(y) + y = self.conv_pw2(y) + if self.use_pool: + for shortcut in self._shortcut: + x = shortcut(x) + return F.relu(paddle.add(x, y)) + + +@register +@serializable +class BlazeNet(nn.Layer): + """ + BlazeFace, see https://arxiv.org/abs/1907.05047 + + Args: + blaze_filters (list): number of filter for each blaze block. + double_blaze_filters (list): number of filter for each double_blaze block. + use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv. + """ + + def __init__( + self, + blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], + double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], + [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], + use_5x5kernel=True): + super(BlazeNet, self).__init__() + conv1_num_filters = blaze_filters[0][0] + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=conv1_num_filters, + kernel_size=3, + stride=2, + padding=1, + name="conv1") + in_channels = conv1_num_filters + self.blaze_block = [] + self._out_channels = [] + for k, v in enumerate(blaze_filters): + assert len(v) in [2, 3], \ + "blaze_filters {} not in [2, 3]" + if len(v) == 2: + self.blaze_block.append( + self.add_sublayer( + 'blaze_{}'.format(k), + BlazeBlock( + in_channels, + v[0], + v[1], + use_5x5kernel=use_5x5kernel, + name='blaze_{}'.format(k)))) + elif len(v) == 3: + self.blaze_block.append( + self.add_sublayer( + 'blaze_{}'.format(k), + BlazeBlock( + in_channels, + v[0], + v[1], + stride=v[2], + use_5x5kernel=use_5x5kernel, + name='blaze_{}'.format(k)))) + in_channels = v[1] + + for k, v in enumerate(double_blaze_filters): + assert len(v) in [3, 4], \ + "blaze_filters {} not in [3, 4]" + if len(v) == 3: + self.blaze_block.append( + self.add_sublayer( + 'double_blaze_{}'.format(k), + BlazeBlock( + in_channels, + v[0], + v[1], + double_channels=v[2], + use_5x5kernel=use_5x5kernel, + name='double_blaze_{}'.format(k)))) + elif len(v) == 4: + self.blaze_block.append( + self.add_sublayer( + 'double_blaze_{}'.format(k), + BlazeBlock( + in_channels, + v[0], + v[1], + double_channels=v[2], + stride=v[3], + use_5x5kernel=use_5x5kernel, + name='double_blaze_{}'.format(k)))) + in_channels = v[2] + self._out_channels.append(in_channels) + + def forward(self, inputs): + outs = [] + y = self.conv1(inputs['image']) + for block in self.blaze_block: + y = block(y) + outs.append(y) + return [outs[-4], outs[-1]] + + @property + def out_shape(self): + return [ + ShapeSpec(channels=c) + for c in [self._out_channels[-4], self._out_channels[-1]] + ] diff --git a/ppdet/modeling/backbones/darknet.py 
b/ppdet/modeling/backbones/darknet.py new file mode 100644 index 0000000..8d3d07a --- /dev/null +++ b/ppdet/modeling/backbones/darknet.py @@ -0,0 +1,322 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from ppdet.modeling.ops import batch_norm, mish +from ..shape_spec import ShapeSpec + +__all__ = ['DarkNet', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + norm_type='bn', + norm_decay=0., + act="leaky", + data_format='NCHW', + name=''): + """ + conv + bn + activation layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 1 + groups (int): number of groups of conv layer, default 1 + padding (int): padding size, default 0 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + act (str): activation function type, default 'leaky', which means leaky_relu + data_format (str): data format, NCHW or NHWC + """ + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + data_format=data_format, + bias_attr=False) + self.batch_norm = batch_norm( + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + self.act = act + + def forward(self, inputs): + out = self.conv(inputs) + out = self.batch_norm(out) + if self.act == 'leaky': + out = F.leaky_relu(out, 0.1) + elif self.act == 'mish': + out = mish(out) + return out + + +class DownSample(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=2, + padding=1, + norm_type='bn', + norm_decay=0., + data_format='NCHW'): + """ + downsample layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 2 + padding (int): padding size, default 1 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ data_format (str): data format, NCHW or NHWC + """ + + super(DownSample, self).__init__() + + self.conv_bn_layer = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + self.ch_out = ch_out + + def forward(self, inputs): + out = self.conv_bn_layer(inputs) + return out + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + norm_type='bn', + norm_decay=0., + data_format='NCHW'): + """ + BasicBlock layer of DarkNet + + Args: + ch_in (int): input channel + ch_out (int): output channel + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + data_format (str): data format, NCHW or NHWC + """ + + super(BasicBlock, self).__init__() + + self.conv1 = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + self.conv2 = ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out * 2, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + + def forward(self, inputs): + conv1 = self.conv1(inputs) + conv2 = self.conv2(conv1) + out = paddle.add(x=inputs, y=conv2) + return out + + +class Blocks(nn.Layer): + def __init__(self, + ch_in, + ch_out, + count, + norm_type='bn', + norm_decay=0., + name=None, + data_format='NCHW'): + """ + Blocks layer, which consist of some BaickBlock layers + + Args: + ch_in (int): input channel + ch_out (int): output channel + count (int): number of BasicBlock layer + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(Blocks, self).__init__() + + self.basicblock0 = BasicBlock( + ch_in, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + self.res_out_list = [] + for i in range(1, count): + block_name = '{}.{}'.format(name, i) + res_out = self.add_sublayer( + block_name, + BasicBlock( + ch_out * 2, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format)) + self.res_out_list.append(res_out) + self.ch_out = ch_out + + def forward(self, inputs): + y = self.basicblock0(inputs) + for basic_block_i in self.res_out_list: + y = basic_block_i(y) + return y + + +DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} + + +@register +@serializable +class DarkNet(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + depth=53, + freeze_at=-1, + return_idx=[2, 3, 4], + num_stages=5, + norm_type='bn', + norm_decay=0., + data_format='NCHW'): + """ + Darknet, see https://pjreddie.com/darknet/yolo/ + + Args: + depth (int): depth of network + freeze_at (int): freeze the backbone at which stage + filter_size (int): filter size, default 3 + return_idx (list): index of stages whose feature maps are returned + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ data_format (str): data format, NCHW or NHWC + """ + super(DarkNet, self).__init__() + self.depth = depth + self.freeze_at = freeze_at + self.return_idx = return_idx + self.num_stages = num_stages + self.stages = DarkNet_cfg[self.depth][0:num_stages] + + self.conv0 = ConvBNLayer( + ch_in=3, + ch_out=32, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + + self.downsample0 = DownSample( + ch_in=32, + ch_out=32 * 2, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format) + + self._out_channels = [] + self.darknet_conv_block_list = [] + self.downsample_list = [] + ch_in = [64, 128, 256, 512, 1024] + for i, stage in enumerate(self.stages): + name = 'stage.{}'.format(i) + conv_block = self.add_sublayer( + name, + Blocks( + int(ch_in[i]), + 32 * (2**i), + stage, + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format, + name=name)) + self.darknet_conv_block_list.append(conv_block) + if i in return_idx: + self._out_channels.append(64 * (2**i)) + for i in range(num_stages - 1): + down_name = 'stage.{}.downsample'.format(i) + downsample = self.add_sublayer( + down_name, + DownSample( + ch_in=32 * (2**(i + 1)), + ch_out=32 * (2**(i + 2)), + norm_type=norm_type, + norm_decay=norm_decay, + data_format=data_format)) + self.downsample_list.append(downsample) + + def forward(self, inputs): + x = inputs['image'] + + out = self.conv0(x) + out = self.downsample0(out) + blocks = [] + for i, conv_block_i in enumerate(self.darknet_conv_block_list): + out = conv_block_i(out) + if i == self.freeze_at: + out.stop_gradient = True + if i in self.return_idx: + blocks.append(out) + if i < self.num_stages - 1: + out = self.downsample_list[i](out) + return blocks + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/ghostnet.py b/ppdet/modeling/backbones/ghostnet.py new file mode 100644 index 0000000..16fb78c --- /dev/null +++ b/ppdet/modeling/backbones/ghostnet.py @@ -0,0 +1,476 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
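
DarkNet.forward above implements freeze_at by setting stop_gradient on a stage's output, which detaches everything that produced that tensor from the backward pass. A minimal sketch of the mechanism with two toy parameters (nothing here comes from the repository):

```python
import paddle

x = paddle.rand([4, 8])
w1 = paddle.create_parameter([8, 8], dtype='float32')
w2 = paddle.create_parameter([8, 1], dtype='float32')

h = paddle.matmul(x, w1)
h.stop_gradient = True              # "freeze" everything that produced h
out = paddle.matmul(h, w2).sum()
out.backward()

print(w1.grad)        # None: no gradient flows into the frozen part
print(w2.grad.shape)  # [8, 1]: the unfrozen parameter still gets a gradient
```
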
+ +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec +from .mobilenet_v3 import make_divisible, ConvBNLayer + +__all__ = ['GhostNet'] + + +class ExtraBlockDW(nn.Layer): + def __init__(self, + in_c, + ch_1, + ch_2, + stride, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None): + super(ExtraBlockDW, self).__init__() + self.pointwise_conv = ConvBNLayer( + in_c=in_c, + out_c=ch_1, + filter_size=1, + stride=1, + padding=0, + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra1") + self.depthwise_conv = ConvBNLayer( + in_c=ch_1, + out_c=ch_2, + filter_size=3, + stride=stride, + padding=1, # + num_groups=int(ch_1), + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_dw") + self.normal_conv = ConvBNLayer( + in_c=ch_2, + out_c=ch_2, + filter_size=1, + stride=1, + padding=0, + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_sep") + + def forward(self, inputs): + x = self.pointwise_conv(inputs) + x = self.depthwise_conv(x) + x = self.normal_conv(x) + return x + + +class SEBlock(nn.Layer): + def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None): + super(SEBlock, self).__init__() + self.pool2d_gap = AdaptiveAvgPool2D(1) + self._num_channels = num_channels + stdv = 1.0 / math.sqrt(num_channels * 1.0) + med_ch = num_channels // reduction_ratio + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + learning_rate=lr_mult, + initializer=Uniform(-stdv, stdv), + name=name + "_1_weights"), + bias_attr=ParamAttr( + learning_rate=lr_mult, name=name + "_1_offset")) + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_channels, + weight_attr=ParamAttr( + learning_rate=lr_mult, + initializer=Uniform(-stdv, stdv), + name=name + "_2_weights"), + bias_attr=ParamAttr( + learning_rate=lr_mult, name=name + "_2_offset")) + + def forward(self, inputs): + pool = self.pool2d_gap(inputs) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = paddle.clip(x=excitation, min=0, max=1) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(inputs, excitation) + return out + + +class GhostModule(nn.Layer): + def __init__(self, + in_channels, + output_channels, + kernel_size=1, + ratio=2, + dw_size=3, + stride=1, + relu=True, + lr_mult=1., + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None): + super(GhostModule, self).__init__() + init_channels = int(math.ceil(output_channels / ratio)) + new_channels = int(init_channels * (ratio - 1)) + self.primary_conv = ConvBNLayer( + in_c=in_channels, + out_c=init_channels, + filter_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + num_groups=1, + act="relu" if relu else None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, 
+ norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_primary_conv") + self.cheap_operation = ConvBNLayer( + in_c=init_channels, + out_c=new_channels, + filter_size=dw_size, + stride=1, + padding=int((dw_size - 1) // 2), + num_groups=init_channels, + act="relu" if relu else None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_cheap_operation") + + def forward(self, inputs): + x = self.primary_conv(inputs) + y = self.cheap_operation(x) + out = paddle.concat([x, y], axis=1) + return out + + +class GhostBottleneck(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + output_channels, + kernel_size, + stride, + use_se, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + return_list=False, + name=None): + super(GhostBottleneck, self).__init__() + self._stride = stride + self._use_se = use_se + self._num_channels = in_channels + self._output_channels = output_channels + self.return_list = return_list + + self.ghost_module_1 = GhostModule( + in_channels=in_channels, + output_channels=hidden_dim, + kernel_size=1, + stride=1, + relu=True, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_ghost_module_1") + if stride == 2: + self.depthwise_conv = ConvBNLayer( + in_c=hidden_dim, + out_c=hidden_dim, + filter_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + num_groups=hidden_dim, + act=None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + if use_se: + self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se") + self.ghost_module_2 = GhostModule( + in_channels=hidden_dim, + output_channels=output_channels, + kernel_size=1, + relu=False, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_ghost_module_2") + if stride != 1 or in_channels != output_channels: + self.shortcut_depthwise = ConvBNLayer( + in_c=in_channels, + out_c=in_channels, + filter_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + num_groups=in_channels, + act=None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
+ ) + self.shortcut_conv = ConvBNLayer( + in_c=in_channels, + out_c=output_channels, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + act=None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_shortcut_conv") + + def forward(self, inputs): + y = self.ghost_module_1(inputs) + x = y + if self._stride == 2: + x = self.depthwise_conv(x) + if self._use_se: + x = self.se_block(x) + x = self.ghost_module_2(x) + + if self._stride == 1 and self._num_channels == self._output_channels: + shortcut = inputs + else: + shortcut = self.shortcut_depthwise(inputs) + shortcut = self.shortcut_conv(shortcut) + x = paddle.add(x=x, y=shortcut) + + if self.return_list: + return [y, x] + else: + return x + + +@register +@serializable +class GhostNet(nn.Layer): + __shared__ = ['norm_type'] + + def __init__( + self, + scale=1.3, + feature_maps=[6, 12, 15], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + conv_decay=0., + norm_type='bn', + norm_decay=0.0, + freeze_norm=False): + super(GhostNet, self).__init__() + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + if norm_type == 'sync_bn' and freeze_norm: + raise ValueError( + "The norm_type should not be sync_bn when freeze_norm is True") + self.feature_maps = feature_maps + self.with_extra_blocks = with_extra_blocks + self.extra_block_filters = extra_block_filters + + inplanes = 16 + self.cfgs = [ + # k, t, c, SE, s + [3, 16, 16, 0, 1], + [3, 48, 24, 0, 2], + [3, 72, 24, 0, 1], + [5, 72, 40, 1, 2], + [5, 120, 40, 1, 1], + [3, 240, 80, 0, 2], + [3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 1, 1], + [3, 672, 112, 1, 1], + [5, 672, 160, 1, 2], # SSDLite output + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1] + ] + self.scale = scale + conv1_out_ch = int(make_divisible(inplanes * self.scale, 4)) + self.conv1 = ConvBNLayer( + in_c=3, + out_c=conv1_out_ch, + filter_size=3, + stride=2, + padding=1, + num_groups=1, + act="relu", + lr_mult=1., + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="conv1") + + # build inverted residual blocks + self._out_channels = [] + self.ghost_bottleneck_list = [] + idx = 0 + inplanes = conv1_out_ch + for k, exp_size, c, use_se, s in self.cfgs: + lr_idx = min(idx // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + # for SSD/SSDLite, first head input is after ResidualUnit expand_conv + return_list = self.with_extra_blocks and idx + 2 in self.feature_maps + + ghost_bottleneck = self.add_sublayer( + "_ghostbottleneck_" + str(idx), + sublayer=GhostBottleneck( + in_channels=inplanes, + hidden_dim=int(make_divisible(exp_size * self.scale, 4)), + output_channels=int(make_divisible(c * self.scale, 4)), + kernel_size=k, + stride=s, + use_se=use_se, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + return_list=return_list, + name="_ghostbottleneck_" + str(idx))) + self.ghost_bottleneck_list.append(ghost_bottleneck) + inplanes = int(make_divisible(c * self.scale, 4)) + idx += 1 + self._update_out_channels( + int(make_divisible(exp_size * self.scale, 4)) + if return_list else inplanes, idx + 1, feature_maps) + + if self.with_extra_blocks: + self.extra_block_list = [] + extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 
4)) + lr_idx = min(idx // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + conv_extra = self.add_sublayer( + "conv" + str(idx + 2), + sublayer=ConvBNLayer( + in_c=inplanes, + out_c=extra_out_c, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + act="relu6", + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="conv" + str(idx + 2))) + self.extra_block_list.append(conv_extra) + idx += 1 + self._update_out_channels(extra_out_c, idx + 1, feature_maps) + + for j, block_filter in enumerate(self.extra_block_filters): + in_c = extra_out_c if j == 0 else self.extra_block_filters[j - + 1][1] + conv_extra = self.add_sublayer( + "conv" + str(idx + 2), + sublayer=ExtraBlockDW( + in_c, + block_filter[0], + block_filter[1], + stride=2, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name='conv' + str(idx + 2))) + self.extra_block_list.append(conv_extra) + idx += 1 + self._update_out_channels(block_filter[1], idx + 1, + feature_maps) + + def _update_out_channels(self, channel, feature_idx, feature_maps): + if feature_idx in feature_maps: + self._out_channels.append(channel) + + def forward(self, inputs): + x = self.conv1(inputs['image']) + outs = [] + for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): + x = ghost_bottleneck(x) + if idx + 2 in self.feature_maps: + if isinstance(x, list): + outs.append(x[0]) + x = x[1] + else: + outs.append(x) + + if not self.with_extra_blocks: + return outs + + for i, block in enumerate(self.extra_block_list): + idx = i + len(self.ghost_bottleneck_list) + x = block(x) + if idx + 2 in self.feature_maps: + outs.append(x) + return outs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/hrnet.py b/ppdet/modeling/backbones/hrnet.py new file mode 100644 index 0000000..f93f5fd --- /dev/null +++ b/ppdet/modeling/backbones/hrnet.py @@ -0,0 +1,723 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
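
GhostNet above multiplies every channel count in self.cfgs by `scale` and rounds the result with make_divisible(..., 4) so all widths stay multiples of 4. A hedged sketch of that rounding idiom; the helper below is a common reference implementation shown only for illustration, and may differ in detail from the make_divisible imported from mobilenet_v3:

```python
def _make_divisible(v, divisor=4, min_value=None):
    # Round v to a multiple of divisor, never rounding down by more than ~10%.
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

scale = 1.3
print(_make_divisible(16 * scale))    # 20  -> conv1 output channels at scale=1.3
print(_make_divisible(112 * scale))   # 144 -> a scaled bottleneck width
```
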
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay +from paddle import ParamAttr +from paddle.nn.initializer import Normal +from numbers import Integral +import math + +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['HRNet'] + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride=1, + norm_type='bn', + norm_groups=32, + use_dcn=False, + norm_decay=0., + freeze_norm=False, + act=None, + name=None): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn'] + + self.act = act + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=1, + weight_attr=ParamAttr( + name=name + "_weights", initializer=Normal( + mean=0., std=0.01)), + bias_attr=False) + + norm_lr = 0. if freeze_norm else 1. + + norm_name = name + '_bn' + param_attr = ParamAttr( + name=norm_name + "_scale", + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + bias_attr = ParamAttr( + name=norm_name + "_offset", + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + global_stats = True if freeze_norm else False + if norm_type in ['bn', 'sync_bn']: + self.norm = nn.BatchNorm( + ch_out, + param_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats, + moving_mean_name=norm_name + '_mean', + moving_variance_name=norm_name + '_variance') + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + norm_params = self.norm.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + + if self.act == 'relu': + out = F.relu(out) + return out + + +class Layer1(nn.Layer): + def __init__(self, + num_channels, + has_se=False, + norm_decay=0., + freeze_norm=True, + name=None): + super(Layer1, self).__init__() + + self.bottleneck_block_list = [] + + for i in range(4): + bottleneck_block = self.add_sublayer( + "block_{}_{}".format(name, i + 1), + BottleneckBlock( + num_channels=num_channels if i == 0 else 256, + num_filters=64, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + '_' + str(i + 1))) + self.bottleneck_block_list.append(bottleneck_block) + + def forward(self, input): + conv = input + for block_func in self.bottleneck_block_list: + conv = block_func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + norm_decay=0., + freeze_norm=True, + name=None): + super(TransitionLayer, self).__init__() + + num_in = len(in_channels) + num_out = len(out_channels) + out = [] + self.conv_bn_func_list = [] + for i in range(num_out): + residual = None + if i < num_in: + if in_channels[i] != out_channels[i]: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + ConvNormLayer( + ch_in=in_channels[i], + ch_out=out_channels[i], + filter_size=3, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act='relu', + name=name + '_layer_' + str(i + 1))) + else: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + ConvNormLayer( + ch_in=in_channels[-1], + ch_out=out_channels[i], + filter_size=3, + stride=2, + norm_decay=norm_decay, + 
freeze_norm=freeze_norm, + act='relu', + name=name + '_layer_' + str(i + 1))) + self.conv_bn_func_list.append(residual) + + def forward(self, input): + outs = [] + for idx, conv_bn_func in enumerate(self.conv_bn_func_list): + if conv_bn_func is None: + outs.append(input[idx]) + else: + if idx < len(input): + outs.append(conv_bn_func(input[idx])) + else: + outs.append(conv_bn_func(input[-1])) + return outs + + +class Branches(nn.Layer): + def __init__(self, + block_num, + in_channels, + out_channels, + has_se=False, + norm_decay=0., + freeze_norm=True, + name=None): + super(Branches, self).__init__() + + self.basic_block_list = [] + for i in range(len(out_channels)): + self.basic_block_list.append([]) + for j in range(block_num): + in_ch = in_channels[i] if j == 0 else out_channels[i] + basic_block_func = self.add_sublayer( + "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), + BasicBlock( + num_channels=in_ch, + num_filters=out_channels[i], + has_se=has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + '_branch_layer_' + str(i + 1) + '_' + + str(j + 1))) + self.basic_block_list[i].append(basic_block_func) + + def forward(self, inputs): + outs = [] + for idx, input in enumerate(inputs): + conv = input + basic_block_list = self.basic_block_list[idx] + for basic_block_func in basic_block_list: + conv = basic_block_func(conv) + outs.append(conv) + return outs + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False, + norm_decay=0., + freeze_norm=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = ConvNormLayer( + ch_in=num_channels, + ch_out=num_filters, + filter_size=1, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act="relu", + name=name + "_conv1") + self.conv2 = ConvNormLayer( + ch_in=num_filters, + ch_out=num_filters, + filter_size=3, + stride=stride, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act="relu", + name=name + "_conv2") + self.conv3 = ConvNormLayer( + ch_in=num_filters, + ch_out=num_filters * 4, + filter_size=1, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act=None, + name=name + "_conv3") + + if self.downsample: + self.conv_down = ConvNormLayer( + ch_in=num_channels, + ch_out=num_filters * 4, + filter_size=1, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act=None, + name=name + "_downsample") + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16, + name='fc' + name) + + def forward(self, input): + residual = input + conv1 = self.conv1(input) + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) + + if self.downsample: + residual = self.conv_down(input) + + if self.has_se: + conv3 = self.se(conv3) + + y = paddle.add(x=residual, y=conv3) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride=1, + has_se=False, + downsample=False, + norm_decay=0., + freeze_norm=True, + name=None): + super(BasicBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + self.conv1 = ConvNormLayer( + ch_in=num_channels, + ch_out=num_filters, + filter_size=3, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + stride=stride, + act="relu", + name=name + "_conv1") + self.conv2 = ConvNormLayer( + ch_in=num_filters, + ch_out=num_filters, + filter_size=3, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + stride=1, + act=None, + 
name=name + "_conv2") + + if self.downsample: + self.conv_down = ConvNormLayer( + ch_in=num_channels, + ch_out=num_filters * 4, + filter_size=1, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act=None, + name=name + "_downsample") + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16, + name='fc' + name) + + def forward(self, input): + residual = input + conv1 = self.conv1(input) + conv2 = self.conv2(conv1) + + if self.downsample: + residual = self.conv_down(input) + + if self.has_se: + conv2 = self.se(conv2) + + y = paddle.add(x=residual, y=conv2) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class Stage(nn.Layer): + def __init__(self, + num_channels, + num_modules, + num_filters, + has_se=False, + norm_decay=0., + freeze_norm=True, + multi_scale_output=True, + name=None): + super(Stage, self).__init__() + + self._num_modules = num_modules + self.stage_func_list = [] + for i in range(num_modules): + if i == num_modules - 1 and not multi_scale_output: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_filters=num_filters, + has_se=has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + multi_scale_output=False, + name=name + '_' + str(i + 1))) + else: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_filters=num_filters, + has_se=has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + '_' + str(i + 1))) + + self.stage_func_list.append(stage_func) + + def forward(self, input): + out = input + for idx in range(self._num_modules): + out = self.stage_func_list[idx](out) + return out + + +class HighResolutionModule(nn.Layer): + def __init__(self, + num_channels, + num_filters, + has_se=False, + multi_scale_output=True, + norm_decay=0., + freeze_norm=True, + name=None): + super(HighResolutionModule, self).__init__() + self.branches_func = Branches( + block_num=4, + in_channels=num_channels, + out_channels=num_filters, + has_se=has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name) + + self.fuse_func = FuseLayers( + in_channels=num_filters, + out_channels=num_filters, + multi_scale_output=multi_scale_output, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name) + + def forward(self, input): + out = self.branches_func(input) + out = self.fuse_func(out) + return out + + +class 
FuseLayers(nn.Layer): + def __init__(self, + in_channels, + out_channels, + multi_scale_output=True, + norm_decay=0., + freeze_norm=True, + name=None): + super(FuseLayers, self).__init__() + + self._actual_ch = len(in_channels) if multi_scale_output else 1 + self._in_channels = in_channels + + self.residual_func_list = [] + for i in range(self._actual_ch): + for j in range(len(in_channels)): + residual_func = None + if j > i: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), + ConvNormLayer( + ch_in=in_channels[j], + ch_out=out_channels[i], + filter_size=1, + stride=1, + act=None, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + '_layer_' + str(i + 1) + '_' + + str(j + 1))) + self.residual_func_list.append(residual_func) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + ConvNormLayer( + ch_in=pre_num_filters, + ch_out=out_channels[i], + filter_size=3, + stride=2, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act=None, + name=name + '_layer_' + str(i + 1) + '_' + + str(j + 1) + '_' + str(k + 1))) + pre_num_filters = out_channels[i] + else: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + ConvNormLayer( + ch_in=pre_num_filters, + ch_out=out_channels[j], + filter_size=3, + stride=2, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act="relu", + name=name + '_layer_' + str(i + 1) + '_' + + str(j + 1) + '_' + str(k + 1))) + pre_num_filters = out_channels[j] + self.residual_func_list.append(residual_func) + + def forward(self, input): + outs = [] + residual_func_idx = 0 + for i in range(self._actual_ch): + residual = input[i] + for j in range(len(self._in_channels)): + if j > i: + y = self.residual_func_list[residual_func_idx](input[j]) + residual_func_idx += 1 + y = F.interpolate(y, scale_factor=2**(j - i)) + residual = paddle.add(x=residual, y=y) + elif j < i: + y = input[j] + for k in range(i - j): + y = self.residual_func_list[residual_func_idx](y) + residual_func_idx += 1 + + residual = paddle.add(x=residual, y=y) + residual = F.relu(residual) + outs.append(residual) + + return outs + + +@register +class HRNet(nn.Layer): + """ + HRNet, see https://arxiv.org/abs/1908.07919 + + Args: + width (int): the width of HRNet + has_se (bool): whether to add SE block for each stage + freeze_at (int): the stage to freeze + freeze_norm (bool): whether to freeze norm in HRNet + norm_decay (float): weight decay for normalization layer weights + return_idx (List): the stage to return + """ + + def __init__(self, + width=18, + has_se=False, + freeze_at=0, + freeze_norm=True, + norm_decay=0., + return_idx=[0, 1, 2, 3]): + super(HRNet, self).__init__() + + self.width = width + self.has_se = has_se + if isinstance(return_idx, Integral): + return_idx = [return_idx] + + assert len(return_idx) > 0, "need one or more return index" + self.freeze_at = freeze_at + self.return_idx = return_idx + + self.channels = { + 18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]], + 30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]], + 32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]], + 40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]], + 44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]], + 48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]], + 60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]], + 64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]] + } + 
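+        # self.channels maps the HRNet width (e.g. 18 for HRNet-W18) to the
+        # per-branch channel lists of stages 2-4: stage n keeps n parallel
+        # resolution branches, and each additional branch doubles the channel
+        # count (18 -> [18, 36] -> [18, 36, 72] -> [18, 36, 72, 144]).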
+ channels_2, channels_3, channels_4 = self.channels[width] + num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3 + self._out_channels = channels_4 + self._out_strides = [4, 8, 16, 32] + + self.conv_layer1_1 = ConvNormLayer( + ch_in=3, + ch_out=64, + filter_size=3, + stride=2, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act='relu', + name="layer1_1") + + self.conv_layer1_2 = ConvNormLayer( + ch_in=64, + ch_out=64, + filter_size=3, + stride=2, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + act='relu', + name="layer1_2") + + self.la1 = Layer1( + num_channels=64, + has_se=has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="layer2") + + self.tr1 = TransitionLayer( + in_channels=[256], + out_channels=channels_2, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="tr1") + + self.st2 = Stage( + num_channels=channels_2, + num_modules=num_modules_2, + num_filters=channels_2, + has_se=self.has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="st2") + + self.tr2 = TransitionLayer( + in_channels=channels_2, + out_channels=channels_3, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="tr2") + + self.st3 = Stage( + num_channels=channels_3, + num_modules=num_modules_3, + num_filters=channels_3, + has_se=self.has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="st3") + + self.tr3 = TransitionLayer( + in_channels=channels_3, + out_channels=channels_4, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="tr3") + self.st4 = Stage( + num_channels=channels_4, + num_modules=num_modules_4, + num_filters=channels_4, + has_se=self.has_se, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="st4") + + def forward(self, inputs): + x = inputs['image'] + conv1 = self.conv_layer1_1(x) + conv2 = self.conv_layer1_2(conv1) + + la1 = self.la1(conv2) + tr1 = self.tr1([la1]) + st2 = self.st2(tr1) + tr2 = self.tr2(st2) + + st3 = self.st3(tr2) + tr3 = self.tr3(st3) + + st4 = self.st4(tr3) + + res = [] + for i, layer in enumerate(st4): + if i == self.freeze_at: + layer.stop_gradient = True + if i in self.return_idx: + res.append(layer) + + return res + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] diff --git a/ppdet/modeling/backbones/mobilenet_v1.py b/ppdet/modeling/backbones/mobilenet_v1.py new file mode 100644 index 0000000..cecc6a5 --- /dev/null +++ b/ppdet/modeling/backbones/mobilenet_v1.py @@ -0,0 +1,410 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
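+#
+# MobileNetV1 backbone: a stem conv followed by a stack of depthwise-separable
+# blocks (3x3 depthwise conv + 1x1 pointwise conv, see DepthwiseSeparable
+# below). `feature_maps` selects which block outputs are returned, and
+# `with_extra_blocks` appends ExtraBlock layers, typically for SSD-style heads.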
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec + +__all__ = ['MobileNet'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + num_groups=1, + act='relu', + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr( + learning_rate=conv_lr, + initializer=KaimingNormal(), + regularizer=L2Decay(conv_decay)), + bias_attr=False) + + param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) + bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) + if norm_type == 'sync_bn': + self._batch_norm = nn.SyncBatchNorm( + out_channels, weight_attr=param_attr, bias_attr=bias_attr) + else: + self._batch_norm = nn.BatchNorm( + out_channels, + act=None, + param_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=False) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + if self.act == "relu": + x = F.relu(x) + elif self.act == "relu6": + x = F.relu6(x) + return x + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + in_channels, + out_channels1, + out_channels2, + num_groups, + stride, + scale, + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(DepthwiseSeparable, self).__init__() + + self._depthwise_conv = ConvBNLayer( + in_channels, + int(out_channels1 * scale), + kernel_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_dw") + + self._pointwise_conv = ConvBNLayer( + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, + stride=1, + padding=0, + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_sep") + + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x + + +class ExtraBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels1, + out_channels2, + num_groups=1, + stride=2, + conv_lr=1., + conv_decay=0., + norm_decay=0., + norm_type='bn', + name=None): + super(ExtraBlock, self).__init__() + + self.pointwise_conv = ConvBNLayer( + in_channels, + int(out_channels1), + kernel_size=1, + stride=1, + padding=0, + num_groups=int(num_groups), + act='relu6', + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_extra1") + + self.normal_conv = ConvBNLayer( + int(out_channels1), + int(out_channels2), + kernel_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups), + act='relu6', + conv_lr=conv_lr, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name=name + "_extra2") + + def forward(self, x): + x = self.pointwise_conv(x) + x = self.normal_conv(x) + return x + + +@register +@serializable +class MobileNet(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + norm_type='bn', + norm_decay=0., + 
conv_decay=0., + scale=1, + conv_learning_rate=1.0, + feature_maps=[4, 6, 13], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], + [64, 128]]): + super(MobileNet, self).__init__() + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + self.feature_maps = feature_maps + self.with_extra_blocks = with_extra_blocks + self.extra_block_filters = extra_block_filters + + self._out_channels = [] + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, + stride=2, + padding=1, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv1") + + self.dwsl = [] + dws21 = self.add_sublayer( + "conv2_1", + sublayer=DepthwiseSeparable( + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, + num_groups=32, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv2_1")) + self.dwsl.append(dws21) + self._update_out_channels(64, len(self.dwsl), feature_maps) + dws22 = self.add_sublayer( + "conv2_2", + sublayer=DepthwiseSeparable( + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, + num_groups=64, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv2_2")) + self.dwsl.append(dws22) + self._update_out_channels(128, len(self.dwsl), feature_maps) + # 1/4 + dws31 = self.add_sublayer( + "conv3_1", + sublayer=DepthwiseSeparable( + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, + num_groups=128, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv3_1")) + self.dwsl.append(dws31) + self._update_out_channels(128, len(self.dwsl), feature_maps) + dws32 = self.add_sublayer( + "conv3_2", + sublayer=DepthwiseSeparable( + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, + num_groups=128, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv3_2")) + self.dwsl.append(dws32) + self._update_out_channels(256, len(self.dwsl), feature_maps) + # 1/8 + dws41 = self.add_sublayer( + "conv4_1", + sublayer=DepthwiseSeparable( + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, + num_groups=256, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv4_1")) + self.dwsl.append(dws41) + self._update_out_channels(256, len(self.dwsl), feature_maps) + dws42 = self.add_sublayer( + "conv4_2", + sublayer=DepthwiseSeparable( + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, + num_groups=256, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv4_2")) + self.dwsl.append(dws42) + self._update_out_channels(512, len(self.dwsl), feature_maps) + # 1/16 + for i in range(5): + tmp = self.add_sublayer( + "conv5_" + str(i + 1), + sublayer=DepthwiseSeparable( + in_channels=512, + out_channels1=512, + out_channels2=512, + num_groups=512, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv5_" + str(i + 1))) + self.dwsl.append(tmp) + self._update_out_channels(512, 
len(self.dwsl), feature_maps) + dws56 = self.add_sublayer( + "conv5_6", + sublayer=DepthwiseSeparable( + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, + num_groups=512, + stride=2, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv5_6")) + self.dwsl.append(dws56) + self._update_out_channels(1024, len(self.dwsl), feature_maps) + # 1/32 + dws6 = self.add_sublayer( + "conv6", + sublayer=DepthwiseSeparable( + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, + num_groups=1024, + stride=1, + scale=scale, + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv6")) + self.dwsl.append(dws6) + self._update_out_channels(1024, len(self.dwsl), feature_maps) + + if self.with_extra_blocks: + self.extra_blocks = [] + for i, block_filter in enumerate(self.extra_block_filters): + in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] + conv_extra = self.add_sublayer( + "conv7_" + str(i + 1), + sublayer=ExtraBlock( + in_c, + block_filter[0], + block_filter[1], + conv_lr=conv_learning_rate, + conv_decay=conv_decay, + norm_decay=norm_decay, + norm_type=norm_type, + name="conv7_" + str(i + 1))) + self.extra_blocks.append(conv_extra) + self._update_out_channels( + block_filter[1], + len(self.dwsl) + len(self.extra_blocks), feature_maps) + + def _update_out_channels(self, channel, feature_idx, feature_maps): + if feature_idx in feature_maps: + self._out_channels.append(channel) + + def forward(self, inputs): + outs = [] + y = self.conv1(inputs['image']) + for i, block in enumerate(self.dwsl): + y = block(y) + if i + 1 in self.feature_maps: + outs.append(y) + + if not self.with_extra_blocks: + return outs + + y = outs[-1] + for i, block in enumerate(self.extra_blocks): + idx = i + len(self.dwsl) + y = block(y) + if idx + 1 in self.feature_maps: + outs.append(y) + return outs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/mobilenet_v3.py b/ppdet/modeling/backbones/mobilenet_v3.py new file mode 100644 index 0000000..d7178c9 --- /dev/null +++ b/ppdet/modeling/backbones/mobilenet_v3.py @@ -0,0 +1,496 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
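+#
+# MobileNetV3 backbone ("large" and "small" configurations). Each ResidualUnit
+# below is an inverted residual: 1x1 expand conv -> kxk depthwise conv ->
+# optional SE block -> 1x1 linear projection, with a shortcut added when the
+# stride is 1 and the input/output channels match. make_divisible (defined
+# below) rounds channel counts to a multiple of 8 without dropping below 90%
+# of the original value, e.g. make_divisible(42) -> 40 and
+# make_divisible(20) -> 24.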
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from numbers import Integral +from ..shape_spec import ShapeSpec + +__all__ = ['MobileNetV3'] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + act=None, + lr_mult=1., + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=""): + super(ConvBNLayer, self).__init__() + self.act = act + self.conv = nn.Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr( + learning_rate=lr_mult, + regularizer=L2Decay(conv_decay), + name=name + "_weights"), + bias_attr=False) + + norm_lr = 0. if freeze_norm else lr_mult + param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + name=name + "_bn_scale", + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + name=name + "_bn_offset", + trainable=False if freeze_norm else True) + global_stats = True if freeze_norm else False + if norm_type == 'sync_bn': + self.bn = nn.SyncBatchNorm( + out_c, weight_attr=param_attr, bias_attr=bias_attr) + else: + self.bn = nn.BatchNorm( + out_c, + act=None, + param_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats, + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + norm_params = self.bn.parameters() + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act is not None: + if self.act == "relu": + x = F.relu(x) + elif self.act == "relu6": + x = F.relu6(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + else: + raise NotImplementedError( + "The activation function is selected incorrectly.") + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + act=None, + return_list=False, + name=''): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.use_se = use_se + self.return_list = return_list + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + act=act, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_expand") + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + act=act, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_depthwise") + if self.use_se: + self.mid_se = SEModule( + mid_c, lr_mult, conv_decay, name=name + "_se") + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + 
stride=1, + padding=0, + act=None, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_linear") + + def forward(self, inputs): + y = self.expand_conv(inputs) + x = self.bottleneck_conv(y) + if self.use_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(inputs, x) + if self.return_list: + return [y, x] + else: + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + mid_channels = int(channel // reduction) + self.conv1 = nn.Conv2D( + in_channels=channel, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr( + learning_rate=lr_mult, + regularizer=L2Decay(conv_decay), + name=name + "_1_weights"), + bias_attr=ParamAttr( + learning_rate=lr_mult, + regularizer=L2Decay(conv_decay), + name=name + "_1_offset")) + self.conv2 = nn.Conv2D( + in_channels=mid_channels, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr( + learning_rate=lr_mult, + regularizer=L2Decay(conv_decay), + name=name + "_2_weights"), + bias_attr=ParamAttr( + learning_rate=lr_mult, + regularizer=L2Decay(conv_decay), + name=name + "_2_offset")) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return paddle.multiply(x=inputs, y=outputs) + + +class ExtraBlockDW(nn.Layer): + def __init__(self, + in_c, + ch_1, + ch_2, + stride, + lr_mult, + conv_decay=0., + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None): + super(ExtraBlockDW, self).__init__() + self.pointwise_conv = ConvBNLayer( + in_c=in_c, + out_c=ch_1, + filter_size=1, + stride=1, + padding='SAME', + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra1") + self.depthwise_conv = ConvBNLayer( + in_c=ch_1, + out_c=ch_2, + filter_size=3, + stride=stride, + padding='SAME', + num_groups=int(ch_1), + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_dw") + self.normal_conv = ConvBNLayer( + in_c=ch_2, + out_c=ch_2, + filter_size=1, + stride=1, + padding='SAME', + act='relu6', + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name=name + "_extra2_sep") + + def forward(self, inputs): + x = self.pointwise_conv(inputs) + x = self.depthwise_conv(x) + x = self.normal_conv(x) + return x + + +@register +@serializable +class MobileNetV3(nn.Layer): + __shared__ = ['norm_type'] + + def __init__( + self, + scale=1.0, + model_name="large", + feature_maps=[6, 12, 15], + with_extra_blocks=False, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + conv_decay=0.0, + multiplier=1.0, + norm_type='bn', + norm_decay=0.0, + freeze_norm=False): + super(MobileNetV3, self).__init__() + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + if norm_type == 'sync_bn' and freeze_norm: + raise ValueError( + "The norm_type should not be sync_bn when freeze_norm is True") + self.feature_maps = feature_maps + self.with_extra_blocks = 
with_extra_blocks + self.extra_block_filters = extra_block_filters + + inplanes = 16 + if model_name == "large": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], # RCNN output + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], # YOLOv3 output + [3, 240, 80, False, "hard_swish", 2], # RCNN output + [3, 200, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 480, 112, True, "hard_swish", 1], + [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output + [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output + [5, 960, 160, True, "hard_swish", 1], + [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output + ] + elif model_name == "small": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], # RCNN output + [3, 88, 24, False, "relu", 1], # YOLOv3 output + [5, 96, 40, True, "hard_swish", 2], # RCNN output + [5, 240, 40, True, "hard_swish", 1], + [5, 240, 40, True, "hard_swish", 1], + [5, 120, 48, True, "hard_swish", 1], + [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output + [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output + [5, 576, 96, True, "hard_swish", 1], + [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output + ] + else: + raise NotImplementedError( + "mode[{}_model] is not implemented!".format(model_name)) + + if multiplier != 1.0: + self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) + self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) + self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) + self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) + self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) + + self.conv1 = ConvBNLayer( + in_c=3, + out_c=make_divisible(inplanes * scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + act="hard_swish", + lr_mult=lr_mult_list[0], + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name="conv1") + + self._out_channels = [] + self.block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in self.cfg: + lr_idx = min(i // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + # for SSD/SSDLite, first head input is after ResidualUnit expand_conv + return_list = self.with_extra_blocks and i + 2 in self.feature_maps + + block = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ResidualUnit( + in_c=inplanes, + mid_c=make_divisible(scale * exp), + out_c=make_divisible(scale * c), + filter_size=k, + stride=s, + use_se=se, + act=nl, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + return_list=return_list, + name="conv" + str(i + 2))) + self.block_list.append(block) + inplanes = make_divisible(scale * c) + i += 1 + self._update_out_channels( + make_divisible(scale * exp) + if return_list else inplanes, i + 1, feature_maps) + + if self.with_extra_blocks: + self.extra_block_list = [] + extra_out_c = make_divisible(scale * self.cfg[-1][1]) + lr_idx = min(i // 3, len(lr_mult_list) - 1) + lr_mult = lr_mult_list[lr_idx] + + conv_extra = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ConvBNLayer( + in_c=inplanes, + out_c=extra_out_c, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + act="hard_swish", + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + 
freeze_norm=freeze_norm, + name="conv" + str(i + 2))) + self.extra_block_list.append(conv_extra) + i += 1 + self._update_out_channels(extra_out_c, i + 1, feature_maps) + + for j, block_filter in enumerate(self.extra_block_filters): + in_c = extra_out_c if j == 0 else self.extra_block_filters[j - + 1][1] + conv_extra = self.add_sublayer( + "conv" + str(i + 2), + sublayer=ExtraBlockDW( + in_c, + block_filter[0], + block_filter[1], + stride=2, + lr_mult=lr_mult, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + name='conv' + str(i + 2))) + self.extra_block_list.append(conv_extra) + i += 1 + self._update_out_channels(block_filter[1], i + 1, feature_maps) + + def _update_out_channels(self, channel, feature_idx, feature_maps): + if feature_idx in feature_maps: + self._out_channels.append(channel) + + def forward(self, inputs): + x = self.conv1(inputs['image']) + outs = [] + for idx, block in enumerate(self.block_list): + x = block(x) + if idx + 2 in self.feature_maps: + if isinstance(x, list): + outs.append(x[0]) + x = x[1] + else: + outs.append(x) + + if not self.with_extra_blocks: + return outs + + for i, block in enumerate(self.extra_block_list): + idx = i + len(self.block_list) + x = block(x) + if idx + 2 in self.feature_maps: + outs.append(x) + return outs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/backbones/name_adapter.py b/ppdet/modeling/backbones/name_adapter.py new file mode 100644 index 0000000..4afbb9b --- /dev/null +++ b/ppdet/modeling/backbones/name_adapter.py @@ -0,0 +1,69 @@ +class NameAdapter(object): + """Fix the backbones variable names for pretrained weight""" + + def __init__(self, model): + super(NameAdapter, self).__init__() + self.model = model + + @property + def model_type(self): + return getattr(self.model, '_model_type', '') + + @property + def variant(self): + return getattr(self.model, 'variant', '') + + def fix_conv_norm_name(self, name): + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + # the naming rule is same as pretrained weight + if self.model_type == 'SEResNeXt': + bn_name = name + "_bn" + return bn_name + + def fix_shortcut_name(self, name): + if self.model_type == 'SEResNeXt': + name = 'conv' + name + '_prj' + return name + + def fix_bottleneck_name(self, name): + if self.model_type == 'SEResNeXt': + conv_name1 = 'conv' + name + '_x1' + conv_name2 = 'conv' + name + '_x2' + conv_name3 = 'conv' + name + '_x3' + shortcut_name = name + else: + conv_name1 = name + "_branch2a" + conv_name2 = name + "_branch2b" + conv_name3 = name + "_branch2c" + shortcut_name = name + "_branch1" + return conv_name1, conv_name2, conv_name3, shortcut_name + + def fix_basicblock_name(self, name): + if self.model_type == 'SEResNeXt': + conv_name1 = 'conv' + name + '_x1' + conv_name2 = 'conv' + name + '_x2' + shortcut_name = name + else: + conv_name1 = name + "_branch2a" + conv_name2 = name + "_branch2b" + shortcut_name = name + "_branch1" + return conv_name1, conv_name2, shortcut_name + + def fix_layer_warp_name(self, stage_num, count, i): + name = 'res' + str(stage_num) + if count > 10 and stage_num == 4: + if i == 0: + conv_name = name + "a" + else: + conv_name = name + "b" + str(i) + else: + conv_name = name + chr(ord("a") + i) + if self.model_type == 'SEResNeXt': + conv_name = str(stage_num + 2) + '_' + str(i + 1) + return conv_name + + def fix_c1_stage_name(self): + return "res_conv1" if self.model_type == 
'ResNeXt' else "conv1" diff --git a/ppdet/modeling/backbones/resnet.py b/ppdet/modeling/backbones/resnet.py new file mode 100644 index 0000000..6be2fc6 --- /dev/null +++ b/ppdet/modeling/backbones/resnet.py @@ -0,0 +1,606 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from numbers import Integral + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform +from paddle import ParamAttr +from paddle.nn.initializer import Constant +from paddle.vision.ops import DeformConv2D +from .name_adapter import NameAdapter +from ..shape_spec import ShapeSpec + +__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] + +ResNet_cfg = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], +} + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + act=None, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + lr=1.0, + dcn_v2=False): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn'] + self.norm_type = norm_type + self.act = act + self.dcn_v2 = dcn_v2 + + if not self.dcn_v2: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr), + bias_attr=False) + else: + self.offset_channel = 2 * filter_size**2 + self.mask_channel = filter_size**2 + + self.conv_offset = nn.Conv2D( + in_channels=ch_in, + out_channels=3 * filter_size**2, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.)), + bias_attr=ParamAttr(initializer=Constant(0.))) + self.conv = DeformConv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=1, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr), + bias_attr=False) + + norm_lr = 0. 
if freeze_norm else lr + param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay), + trainable=False if freeze_norm else True) + + global_stats = True if freeze_norm else False + if norm_type == 'sync_bn': + self.norm = nn.SyncBatchNorm( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + else: + self.norm = nn.BatchNorm( + ch_out, + act=None, + param_attr=param_attr, + bias_attr=bias_attr, + use_global_stats=global_stats) + norm_params = self.norm.parameters() + + if freeze_norm: + for param in norm_params: + param.stop_gradient = True + + def forward(self, inputs): + if not self.dcn_v2: + out = self.conv(inputs) + else: + offset_mask = self.conv_offset(inputs) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + out = self.conv(inputs, offset, mask=mask) + + if self.norm_type in ['bn', 'sync_bn']: + out = self.norm(out) + if self.act: + out = getattr(F, self.act)(out) + return out + + +class SELayer(nn.Layer): + def __init__(self, ch, reduction_ratio=16): + super(SELayer, self).__init__() + self.pool = nn.AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(ch) + c_ = ch // reduction_ratio + self.squeeze = nn.Linear( + ch, + c_, + weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=True) + + stdv = 1.0 / math.sqrt(c_) + self.extract = nn.Linear( + c_, + ch, + weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=True) + + def forward(self, inputs): + out = self.pool(inputs) + out = paddle.squeeze(out, axis=[2, 3]) + out = self.squeeze(out) + out = F.relu(out) + out = self.extract(out) + out = F.sigmoid(out) + out = paddle.unsqueeze(out, axis=[2, 3]) + scale = out * inputs + return scale + + +class BasicBlock(nn.Layer): + + expansion = 1 + + def __init__(self, + ch_in, + ch_out, + stride, + shortcut, + variant='b', + groups=1, + base_width=64, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(BasicBlock, self).__init__() + assert dcn_v2 is False, "Not implemented yet." 
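+        # BasicBlock is the two-layer 3x3 residual block used for ResNet-18/34;
+        # deformable conv (dcn_v2) and grouped convolution are only supported
+        # by the BottleNeck block.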
+ assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' + + self.shortcut = shortcut + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential() + self.short.add_sublayer( + 'pool', + nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True)) + self.short.add_sublayer( + 'conv', + ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr)) + else: + self.short = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=stride, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2a = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=3, + stride=stride, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2b = ConvNormLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=3, + stride=1, + act=None, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.std_senet = std_senet + if self.std_senet: + self.se = SELayer(ch_out) + + def forward(self, inputs): + out = self.branch2a(inputs) + out = self.branch2b(out) + if self.std_senet: + out = self.se(out) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + out = paddle.add(x=out, y=short) + out = F.relu(out) + + return out + + +class BottleNeck(nn.Layer): + + expansion = 4 + + def __init__(self, + ch_in, + ch_out, + stride, + shortcut, + variant='b', + groups=1, + base_width=4, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(BottleNeck, self).__init__() + if variant == 'a': + stride1, stride2 = stride, 1 + else: + stride1, stride2 = 1, stride + + # ResNeXt + width = int(ch_out * (base_width / 64.)) * groups + + self.shortcut = shortcut + if not shortcut: + if variant == 'd' and stride == 2: + self.short = nn.Sequential() + self.short.add_sublayer( + 'pool', + nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True)) + self.short.add_sublayer( + 'conv', + ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr)) + else: + self.short = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=stride, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2a = ConvNormLayer( + ch_in=ch_in, + ch_out=width, + filter_size=1, + stride=stride1, + groups=1, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.branch2b = ConvNormLayer( + ch_in=width, + ch_out=width, + filter_size=3, + stride=stride2, + groups=groups, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr, + dcn_v2=dcn_v2) + + self.branch2c = ConvNormLayer( + ch_in=width, + ch_out=ch_out * self.expansion, + filter_size=1, + stride=1, + groups=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=lr) + + self.std_senet = std_senet + if self.std_senet: + self.se = SELayer(ch_out * self.expansion) + + def forward(self, inputs): + + out = self.branch2a(inputs) + out = self.branch2b(out) + out = self.branch2c(out) + + if self.std_senet: + out = self.se(out) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + out = 
paddle.add(x=out, y=short) + out = F.relu(out) + + return out + + +class Blocks(nn.Layer): + def __init__(self, + block, + ch_in, + ch_out, + count, + name_adapter, + stage_num, + variant='b', + groups=1, + base_width=64, + lr=1.0, + norm_type='bn', + norm_decay=0., + freeze_norm=True, + dcn_v2=False, + std_senet=False): + super(Blocks, self).__init__() + + self.blocks = [] + for i in range(count): + conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) + layer = self.add_sublayer( + conv_name, + block( + ch_in=ch_in, + ch_out=ch_out, + stride=2 if i == 0 and stage_num != 2 else 1, + shortcut=False if i == 0 else True, + variant=variant, + groups=groups, + base_width=base_width, + lr=lr, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + dcn_v2=dcn_v2, + std_senet=std_senet)) + self.blocks.append(layer) + if i == 0: + ch_in = ch_out * block.expansion + + def forward(self, inputs): + block_out = inputs + for block in self.blocks: + block_out = block(block_out) + return block_out + + +@register +@serializable +class ResNet(nn.Layer): + __shared__ = ['norm_type'] + + def __init__(self, + depth=50, + ch_in=64, + variant='b', + lr_mult_list=[1.0, 1.0, 1.0, 1.0], + groups=1, + base_width=64, + norm_type='bn', + norm_decay=0, + freeze_norm=True, + freeze_at=0, + return_idx=[0, 1, 2, 3], + dcn_v2_stages=[-1], + num_stages=4, + std_senet=False): + """ + Residual Network, see https://arxiv.org/abs/1512.03385 + + Args: + depth (int): ResNet depth, should be 18, 34, 50, 101, 152. + ch_in (int): output channel of first stage, default 64 + variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently + lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), + lower learning rate ratio is need for pretrained model + got using distillation(default as [1.0, 1.0, 1.0, 1.0]). 
+ groups (int): group convolution cardinality + base_width (int): base width of each group convolution + norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' + norm_decay (float): weight decay for normalization layer weights + freeze_norm (bool): freeze normalization layers + freeze_at (int): freeze the backbone at which stage + return_idx (list): index of the stages whose feature maps are returned + dcn_v2_stages (list): index of stages who select deformable conv v2 + num_stages (int): total num of stages + std_senet (bool): whether use senet, default True + """ + super(ResNet, self).__init__() + self._model_type = 'ResNet' if groups == 1 else 'ResNeXt' + assert num_stages >= 1 and num_stages <= 4 + self.depth = depth + self.variant = variant + self.groups = groups + self.base_width = base_width + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + self.freeze_at = freeze_at + if isinstance(return_idx, Integral): + return_idx = [return_idx] + assert max(return_idx) < num_stages, \ + 'the maximum return index must smaller than num_stages, ' \ + 'but received maximum return index is {} and num_stages ' \ + 'is {}'.format(max(return_idx), num_stages) + self.return_idx = return_idx + self.num_stages = num_stages + assert len(lr_mult_list) == 4, \ + "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) + if isinstance(dcn_v2_stages, Integral): + dcn_v2_stages = [dcn_v2_stages] + assert max(dcn_v2_stages) < num_stages + + if isinstance(dcn_v2_stages, Integral): + dcn_v2_stages = [dcn_v2_stages] + assert max(dcn_v2_stages) < num_stages + self.dcn_v2_stages = dcn_v2_stages + + block_nums = ResNet_cfg[depth] + na = NameAdapter(self) + + conv1_name = na.fix_c1_stage_name() + if variant in ['c', 'd']: + conv_def = [ + [3, ch_in // 2, 3, 2, "conv1_1"], + [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"], + [ch_in // 2, ch_in, 3, 1, "conv1_3"], + ] + else: + conv_def = [[3, ch_in, 7, 2, conv1_name]] + self.conv1 = nn.Sequential() + for (c_in, c_out, k, s, _name) in conv_def: + self.conv1.add_sublayer( + _name, + ConvNormLayer( + ch_in=c_in, + ch_out=c_out, + filter_size=k, + stride=s, + groups=1, + act='relu', + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + lr=1.0)) + + self.ch_in = ch_in + ch_out_list = [64, 128, 256, 512] + block = BottleNeck if depth >= 50 else BasicBlock + + self._out_channels = [block.expansion * v for v in ch_out_list] + self._out_strides = [4, 8, 16, 32] + + self.res_layers = [] + for i in range(num_stages): + lr_mult = lr_mult_list[i] + stage_num = i + 2 + res_name = "res{}".format(stage_num) + res_layer = self.add_sublayer( + res_name, + Blocks( + block, + self.ch_in, + ch_out_list[i], + count=block_nums[i], + name_adapter=na, + stage_num=stage_num, + variant=variant, + groups=groups, + base_width=base_width, + lr=lr_mult, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + dcn_v2=(i in self.dcn_v2_stages), + std_senet=std_senet)) + self.res_layers.append(res_layer) + self.ch_in = self._out_channels[i] + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] + + def forward(self, inputs): + x = inputs['image'] + conv1 = self.conv1(x) + x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1) + outs = [] + for idx, stage in enumerate(self.res_layers): + x = stage(x) + if idx == self.freeze_at: + x.stop_gradient = True + if idx in self.return_idx: + 
outs.append(x) + return outs + + +@register +class Res5Head(nn.Layer): + def __init__(self, depth=50): + super(Res5Head, self).__init__() + feat_in, feat_out = [1024, 512] + if depth < 50: + feat_in = 256 + na = NameAdapter(self) + block = BottleNeck if depth >= 50 else BasicBlock + self.res5 = Blocks( + block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5) + self.feat_out = feat_out if depth < 50 else feat_out * 4 + + @property + def out_shape(self): + return [ShapeSpec( + channels=self.feat_out, + stride=16, )] + + def forward(self, roi_feat, stage=0): + y = self.res5(roi_feat) + return y diff --git a/ppdet/modeling/backbones/senet.py b/ppdet/modeling/backbones/senet.py new file mode 100644 index 0000000..a621c69 --- /dev/null +++ b/ppdet/modeling/backbones/senet.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from .resnet import ResNet, Blocks, BasicBlock, BottleNeck + +__all__ = ['SENet', 'SERes5Head'] + + +@register +@serializable +class SENet(ResNet): + __shared__ = ['norm_type'] + + def __init__(self, + depth=50, + variant='b', + lr_mult_list=[1.0, 1.0, 1.0, 1.0], + groups=1, + base_width=64, + norm_type='bn', + norm_decay=0, + freeze_norm=True, + freeze_at=0, + return_idx=[0, 1, 2, 3], + dcn_v2_stages=[-1], + std_senet=True, + num_stages=4): + """ + Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507 + + Args: + depth (int): SENet depth, should be 50, 101, 152 + variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently + lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), + lower learning rate ratio is need for pretrained model + got using distillation(default as [1.0, 1.0, 1.0, 1.0]). 
+ groups (int): group convolution cardinality + base_width (int): base width of each group convolution + norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' + norm_decay (float): weight decay for normalization layer weights + freeze_norm (bool): freeze normalization layers + freeze_at (int): freeze the backbone at which stage + return_idx (list): index of the stages whose feature maps are returned + dcn_v2_stages (list): index of stages who select deformable conv v2 + std_senet (bool): whether use senet, default True + num_stages (int): total num of stages + """ + + super(SENet, self).__init__( + depth=depth, + variant=variant, + lr_mult_list=lr_mult_list, + ch_in=128, + groups=groups, + base_width=base_width, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + freeze_at=freeze_at, + return_idx=return_idx, + dcn_v2_stages=dcn_v2_stages, + std_senet=std_senet, + num_stages=num_stages) + + +@register +class SERes5Head(nn.Layer): + def __init__(self, + depth=50, + variant='b', + lr_mult=1.0, + groups=1, + base_width=64, + norm_type='bn', + norm_decay=0, + dcn_v2=False, + freeze_norm=False, + std_senet=True): + """ + SERes5Head layer + + Args: + depth (int): SENet depth, should be 50, 101, 152 + variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently + lr_mult (list): learning rate ratio of SERes5Head, default as 1.0. + groups (int): group convolution cardinality + base_width (int): base width of each group convolution + norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' + norm_decay (float): weight decay for normalization layer weights + dcn_v2_stages (list): index of stages who select deformable conv v2 + std_senet (bool): whether use senet, default True + + """ + super(SERes5Head, self).__init__() + ch_out = 512 + ch_in = 256 if depth < 50 else 1024 + na = NameAdapter(self) + block = BottleNeck if depth >= 50 else BasicBlock + self.res5 = Blocks( + block, + ch_in, + ch_out, + count=3, + name_adapter=na, + stage_num=5, + variant=variant, + groups=groups, + base_width=base_width, + lr=lr_mult, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + dcn_v2=dcn_v2, + std_senet=std_senet) + self.ch_out = ch_out * block.expansion + + @property + def out_shape(self): + return [ShapeSpec( + channels=self.ch_out, + stride=16, )] + + def forward(self, roi_feat): + y = self.res5(roi_feat) + return y diff --git a/ppdet/modeling/backbones/vgg.py b/ppdet/modeling/backbones/vgg.py new file mode 100644 index 0000000..dd03872 --- /dev/null +++ b/ppdet/modeling/backbones/vgg.py @@ -0,0 +1,216 @@ +from __future__ import division + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn import Conv2D, MaxPool2D +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['VGG'] + +VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]} + + +class ConvBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + groups, + pool_size=2, + pool_stride=2, + pool_padding=0, + name=None): + super(ConvBlock, self).__init__() + + self.groups = groups + self.conv0 = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(name=name + "1_weights"), + bias_attr=ParamAttr(name=name + "1_bias")) + self.conv_out_list = [] + for i in range(1, groups): + conv_out = self.add_sublayer( + 
'conv{}'.format(i), + Conv2D( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name=name + "{}_weights".format(i + 1)), + bias_attr=ParamAttr(name=name + "{}_bias".format(i + 1)))) + self.conv_out_list.append(conv_out) + + self.pool = MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, + ceil_mode=True) + + def forward(self, inputs): + out = self.conv0(inputs) + out = F.relu(out) + for conv_i in self.conv_out_list: + out = conv_i(out) + out = F.relu(out) + pool = self.pool(out) + return out, pool + + +class ExtraBlock(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + padding, + stride, + kernel_size, + name=None): + super(ExtraBlock, self).__init__() + + self.conv0 = Conv2D( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0) + self.conv1 = Conv2D( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding) + + def forward(self, inputs): + out = self.conv0(inputs) + out = F.relu(out) + out = self.conv1(out) + out = F.relu(out) + return out + + +class L2NormScale(nn.Layer): + def __init__(self, num_channels, scale=1.0): + super(L2NormScale, self).__init__() + self.scale = self.create_parameter( + attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)), + shape=[num_channels]) + + def forward(self, inputs): + out = F.normalize(inputs, axis=1, epsilon=1e-10) + # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( + # out) * out + out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out + return out + + +@register +@serializable +class VGG(nn.Layer): + def __init__(self, + depth=16, + normalizations=[20., -1, -1, -1, -1, -1], + extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], + [128, 256, 0, 1, 3], + [128, 256, 0, 1, 3]]): + super(VGG, self).__init__() + + assert depth in [16, 19], \ + "depth as 16/19 supported currently, but got {}".format(depth) + self.depth = depth + self.groups = VGG_cfg[depth] + self.normalizations = normalizations + self.extra_block_filters = extra_block_filters + + self._out_channels = [] + + self.conv_block_0 = ConvBlock( + 3, 64, self.groups[0], 2, 2, 0, name="conv1_") + self.conv_block_1 = ConvBlock( + 64, 128, self.groups[1], 2, 2, 0, name="conv2_") + self.conv_block_2 = ConvBlock( + 128, 256, self.groups[2], 2, 2, 0, name="conv3_") + self.conv_block_3 = ConvBlock( + 256, 512, self.groups[3], 2, 2, 0, name="conv4_") + self.conv_block_4 = ConvBlock( + 512, 512, self.groups[4], 3, 1, 1, name="conv5_") + self._out_channels.append(512) + + self.fc6 = Conv2D( + in_channels=512, + out_channels=1024, + kernel_size=3, + stride=1, + padding=6, + dilation=6) + self.fc7 = Conv2D( + in_channels=1024, + out_channels=1024, + kernel_size=1, + stride=1, + padding=0) + self._out_channels.append(1024) + + # extra block + self.extra_convs = [] + last_channels = 1024 + for i, v in enumerate(self.extra_block_filters): + assert len(v) == 5, "extra_block_filters size not fix" + extra_conv = self.add_sublayer("conv{}".format(6 + i), + ExtraBlock(last_channels, v[0], v[1], + v[2], v[3], v[4])) + last_channels = v[1] + self.extra_convs.append(extra_conv) + self._out_channels.append(last_channels) + + self.norms = [] + for i, n in enumerate(self.normalizations): + if n != -1: + norm = self.add_sublayer("norm{}".format(i), + L2NormScale( + self.extra_block_filters[i][1], n)) + else: + norm = None + 
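+ # keep one entry per output (None when normalization is disabled) so that
+ # forward() can index self.norms by output position; a positive value such
+ # as 20. applies the SSD-style L2 normalization with a learnable rescale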
self.norms.append(norm) + + def forward(self, inputs): + outputs = [] + + conv, pool = self.conv_block_0(inputs['image']) + conv, pool = self.conv_block_1(pool) + conv, pool = self.conv_block_2(pool) + conv, pool = self.conv_block_3(pool) + outputs.append(conv) + + conv, pool = self.conv_block_4(pool) + out = self.fc6(pool) + out = F.relu(out) + out = self.fc7(out) + out = F.relu(out) + outputs.append(out) + + if not self.extra_block_filters: + return outputs + + # extra block + for extra_conv in self.extra_convs: + out = extra_conv(out) + outputs.append(out) + + for i, n in enumerate(self.normalizations): + if n != -1: + outputs[i] = self.norms[i](outputs[i]) + + return outputs + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/bbox_utils.py b/ppdet/modeling/bbox_utils.py new file mode 100644 index 0000000..c77a5ae --- /dev/null +++ b/ppdet/modeling/bbox_utils.py @@ -0,0 +1,528 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn.functional as F +import math +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights): + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights): + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 
3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(w > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def bbox_overlaps(boxes1, boxes2): + """ + Calculate overlaps between boxes1 and boxes2 + + Args: + boxes1 (Tensor): boxes with shape [M, 4] + boxes2 (Tensor): boxes with shape [N, 4] + + Return: + overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] + """ + area1 = bbox_area(boxes1) + area2 = bbox_area(boxes2) + + xy_max = paddle.minimum( + paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) + xy_min = paddle.maximum( + paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) + width_height = xy_max - xy_min + width_height = width_height.clip(min=0) + inter = width_height.prod(axis=2) + + overlaps = paddle.where(inter > 0, inter / + (paddle.unsqueeze(area1, 1) + area2 - inter), + paddle.zeros_like(inter)) + return overlaps + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. 
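+ Note: the decoded values are normalized to the input image, i.e. x/y are
+ the grid offsets divided by grid_w/grid_h, and w/h are exp(w/h) times the
+ anchor size divided by (downsample_ratio * grid size)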
+ + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor) + anchor = paddle.cast(anchor, x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + x1 = paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def rect2rbox(bboxes): + """ + :param bboxes: shape (n, 4) (xmin, ymin, xmax, ymax) + :return: dbboxes: shape (n, 5) (x_ctr, y_ctr, w, h, angle) + """ + bboxes = bboxes.reshape(-1, 4) + num_boxes = bboxes.shape[0] + + x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0 + y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0 + edges1 = np.abs(bboxes[:, 2] - bboxes[:, 0]) + edges2 = np.abs(bboxes[:, 3] - bboxes[:, 1]) + angles = np.zeros([num_boxes], 
dtype=bboxes.dtype) + + inds = edges1 < edges2 + + rboxes = np.stack((x_ctr, y_ctr, edges1, edges2, angles), axis=1) + rboxes[inds, 2] = edges2[inds] + rboxes[inds, 3] = edges1[inds] + rboxes[inds, 4] = np.pi / 2.0 + return rboxes + + +def delta2rbox(Rrois, + deltas, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1], + wh_ratio_clip=1e-6): + """ + :param Rrois: (cx, cy, w, h, theta) + :param deltas: (dx, dy, dw, dh, dtheta) + :param means: + :param stds: + :param wh_ratio_clip: + :return: + """ + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + deltas = paddle.reshape(deltas, [-1, deltas.shape[-1]]) + denorm_deltas = deltas * stds + means + + dx = denorm_deltas[:, 0] + dy = denorm_deltas[:, 1] + dw = denorm_deltas[:, 2] + dh = denorm_deltas[:, 3] + dangle = denorm_deltas[:, 4] + + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) + dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) + + Rroi_x = Rrois[:, 0] + Rroi_y = Rrois[:, 1] + Rroi_w = Rrois[:, 2] + Rroi_h = Rrois[:, 3] + Rroi_angle = Rrois[:, 4] + + gx = dx * Rroi_w * paddle.cos(Rroi_angle) - dy * Rroi_h * paddle.sin( + Rroi_angle) + Rroi_x + gy = dx * Rroi_w * paddle.sin(Rroi_angle) + dy * Rroi_h * paddle.cos( + Rroi_angle) + Rroi_y + gw = Rroi_w * dw.exp() + gh = Rroi_h * dh.exp() + ga = np.pi * dangle + Rroi_angle + ga = (ga + np.pi / 4) % np.pi - np.pi / 4 + ga = paddle.to_tensor(ga) + + gw = paddle.to_tensor(gw, dtype='float32') + gh = paddle.to_tensor(gh, dtype='float32') + bboxes = paddle.stack([gx, gy, gw, gh, ga], axis=-1) + return bboxes + + +def rbox2delta(proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): + """ + + Args: + proposals: + gt: + means: 1x5 + stds: 1x5 + + Returns: + + """ + proposals = proposals.astype(np.float64) + + PI = np.pi + + gt_widths = gt[..., 2] + gt_heights = gt[..., 3] + gt_angle = gt[..., 4] + + proposals_widths = proposals[..., 2] + proposals_heights = proposals[..., 3] + proposals_angle = proposals[..., 4] + + coord = gt[..., 0:2] - proposals[..., 0:2] + dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) + * coord[..., 1]) / proposals_widths + dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) + * coord[..., 1]) / proposals_heights + dw = np.log(gt_widths / proposals_widths) + dh = np.log(gt_heights / proposals_heights) + da = (gt_angle - proposals_angle) + + da = (da + PI / 4) % PI - PI / 4 + da /= PI + + deltas = np.stack([dx, dy, dw, dh, da], axis=-1) + means = np.array(means, dtype=deltas.dtype) + stds = np.array(stds, dtype=deltas.dtype) + deltas = (deltas - means) / stds + deltas = deltas.astype(np.float32) + return deltas + + +def bbox_decode(bbox_preds, + anchors, + means=[0, 0, 0, 0, 0], + stds=[1, 1, 1, 1, 1]): + """decode bbox from deltas + Args: + bbox_preds: [N,H,W,5] + anchors: [H*W,5] + return: + bboxes: [N,H,W,5] + """ + means = paddle.to_tensor(means) + stds = paddle.to_tensor(stds) + num_imgs, H, W, _ = bbox_preds.shape + bboxes_list = [] + for img_id in range(num_imgs): + bbox_pred = bbox_preds[img_id] + # bbox_pred.shape=[5,H,W] + bbox_delta = bbox_pred + anchors = paddle.to_tensor(anchors) + bboxes = delta2rbox( + anchors, bbox_delta, means, stds, wh_ratio_clip=1e-6) + bboxes = paddle.reshape(bboxes, [H, W, 5]) + bboxes_list.append(bboxes) + return paddle.stack(bboxes_list, axis=0) + + +def poly_to_rbox(polys): + """ + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + to + rotated_boxes:[x_ctr,y_ctr,w,h,angle] + """ + rotated_boxes = [] + for poly in polys: + poly = 
np.array(poly[:8], dtype=np.float32) + + pt1 = (poly[0], poly[1]) + pt2 = (poly[2], poly[3]) + pt3 = (poly[4], poly[5]) + pt4 = (poly[6], poly[7]) + + edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[ + 1]) * (pt1[1] - pt2[1])) + edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[ + 1]) * (pt2[1] - pt3[1])) + + width = max(edge1, edge2) + height = min(edge1, edge2) + + rbox_angle = 0 + if edge1 > edge2: + rbox_angle = np.arctan2( + np.float(pt2[1] - pt1[1]), np.float(pt2[0] - pt1[0])) + elif edge2 >= edge1: + rbox_angle = np.arctan2( + np.float(pt4[1] - pt1[1]), np.float(pt4[0] - pt1[0])) + + def norm_angle(angle, range=[-np.pi / 4, np.pi]): + return (angle - range[0]) % range[1] + range[0] + + rbox_angle = norm_angle(rbox_angle) + + x_ctr = np.float(pt1[0] + pt3[0]) / 2 + y_ctr = np.float(pt1[1] + pt3[1]) / 2 + rotated_box = np.array([x_ctr, y_ctr, width, height, rbox_angle]) + rotated_boxes.append(rotated_box) + ret_rotated_boxes = np.array(rotated_boxes) + assert ret_rotated_boxes.shape[1] == 5 + return ret_rotated_boxes + + +def cal_line_length(point1, point2): + import math + return math.sqrt( + math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) + + +def get_best_begin_point_single(coordinate): + x1, y1, x2, y2, x3, y3, x4, y4 = coordinate + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + xmax = max(x1, x2, x3, x4) + ymax = max(y1, y2, y3, y4) + combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], + [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], + [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], + [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] + dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + force = 100000000.0 + force_flag = 0 + for i in range(4): + temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + + cal_line_length(combinate[i][3], dst_coordinate[3]) + if temp_force < force: + force = temp_force + force_flag = i + if force_flag != 0: + pass + return np.array(combinate[force_flag]).reshape(8) + + +def rbox2poly_single(rrect): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + # rect 2x4 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + # poly + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + return poly + + +def rbox2poly(rrects): + """ + rrect:[x_ctr,y_ctr,w,h,angle] + to + poly:[x0,y0,x1,y1,x2,y2,x3,y3] + """ + polys = [] + for rrect in rrects: + x_ctr, y_ctr, width, height, angle = rrect[:5] + tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 + rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) + R = np.array([[np.cos(angle), -np.sin(angle)], + [np.sin(angle), np.cos(angle)]]) + poly = R.dot(rect) + x0, x1, x2, x3 = poly[0, :4] + x_ctr + y0, y1, y2, y3 = poly[1, :4] + y_ctr + poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) + poly = get_best_begin_point_single(poly) + polys.append(poly) + polys = np.array(polys) + return polys diff --git a/ppdet/modeling/heads/__init__.py 
b/ppdet/modeling/heads/__init__.py new file mode 100644 index 0000000..9263aa8 --- /dev/null +++ b/ppdet/modeling/heads/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import bbox_head +from . import mask_head +from . import yolo_head +from . import roi_extractor +from . import ssd_head +from . import fcos_head +from . import solov2_head +from . import ttf_head +from . import cascade_head +from . import face_head +from . import s2anet_head + +from .bbox_head import * +from .mask_head import * +from .yolo_head import * +from .roi_extractor import * +from .ssd_head import * +from .fcos_head import * +from .solov2_head import * +from .ttf_head import * +from .cascade_head import * +from .face_head import * +from .s2anet_head import * diff --git a/ppdet/modeling/heads/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..07ee313 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..a2dd652 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/bbox_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/bbox_head.cpython-38.pyc new file mode 100644 index 0000000..fd6d751 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/bbox_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/bbox_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/bbox_head.cpython-39.pyc new file mode 100644 index 0000000..32ccbbd Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/bbox_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/cascade_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/cascade_head.cpython-38.pyc new file mode 100644 index 0000000..5b6bc2f Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/cascade_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/cascade_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/cascade_head.cpython-39.pyc new file mode 100644 index 0000000..ca9faf7 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/cascade_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/face_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/face_head.cpython-38.pyc new file mode 100644 index 0000000..648d244 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/face_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/face_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/face_head.cpython-39.pyc new file mode 100644 index 0000000..7f4ebaf Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/face_head.cpython-39.pyc differ diff --git 
a/ppdet/modeling/heads/__pycache__/fcos_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/fcos_head.cpython-38.pyc new file mode 100644 index 0000000..6ad0daa Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/fcos_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/fcos_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/fcos_head.cpython-39.pyc new file mode 100644 index 0000000..9d7d326 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/fcos_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/mask_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/mask_head.cpython-38.pyc new file mode 100644 index 0000000..23af4e5 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/mask_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/mask_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/mask_head.cpython-39.pyc new file mode 100644 index 0000000..8175462 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/mask_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-38.pyc new file mode 100644 index 0000000..3ea0242 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-39.pyc new file mode 100644 index 0000000..3ecfb41 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/roi_extractor.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-38.pyc new file mode 100644 index 0000000..fb75321 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-39.pyc new file mode 100644 index 0000000..fd623b8 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/s2anet_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/solov2_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/solov2_head.cpython-38.pyc new file mode 100644 index 0000000..dfc1f91 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/solov2_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/solov2_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/solov2_head.cpython-39.pyc new file mode 100644 index 0000000..4fd4c62 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/solov2_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/ssd_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/ssd_head.cpython-38.pyc new file mode 100644 index 0000000..330d8b2 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/ssd_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/ssd_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/ssd_head.cpython-39.pyc new file mode 100644 index 0000000..c1ac7c8 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/ssd_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/ttf_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/ttf_head.cpython-38.pyc new file mode 100644 index 0000000..9ffcb91 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/ttf_head.cpython-38.pyc differ diff --git 
a/ppdet/modeling/heads/__pycache__/ttf_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/ttf_head.cpython-39.pyc new file mode 100644 index 0000000..e16f3b9 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/ttf_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/yolo_head.cpython-38.pyc b/ppdet/modeling/heads/__pycache__/yolo_head.cpython-38.pyc new file mode 100644 index 0000000..758b6cb Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/yolo_head.cpython-38.pyc differ diff --git a/ppdet/modeling/heads/__pycache__/yolo_head.cpython-39.pyc b/ppdet/modeling/heads/__pycache__/yolo_head.cpython-39.pyc new file mode 100644 index 0000000..6594de0 Binary files /dev/null and b/ppdet/modeling/heads/__pycache__/yolo_head.cpython-39.pyc differ diff --git a/ppdet/modeling/heads/bbox_head.py b/ppdet/modeling/heads/bbox_head.py new file mode 100644 index 0000000..0979637 --- /dev/null +++ b/ppdet/modeling/heads/bbox_head.py @@ -0,0 +1,368 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register, create +from ppdet.modeling import ops + +from .roi_extractor import RoIAlign +from ..shape_spec import ShapeSpec +from ..bbox_utils import bbox2delta +from ppdet.modeling.layers import ConvNormLayer + +__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] + + +@register +class TwoFCHead(nn.Layer): + """ + RCNN bbox head with Two fc layers to extract feature + + Args: + in_channel (int): Input channel which can be derived by from_config + out_channel (int): Output channel + resolution (int): Resolution of input feature map, default 7 + """ + + def __init__(self, in_channel=256, out_channel=1024, resolution=7): + super(TwoFCHead, self).__init__() + self.in_channel = in_channel + self.out_channel = out_channel + fan = in_channel * resolution * resolution + self.fc6 = nn.Linear( + in_channel * resolution * resolution, + out_channel, + weight_attr=paddle.ParamAttr( + initializer=XavierUniform(fan_out=fan))) + + self.fc7 = nn.Linear( + out_channel, + out_channel, + weight_attr=paddle.ParamAttr(initializer=XavierUniform())) + + @classmethod + def from_config(cls, cfg, input_shape): + s = input_shape + s = s[0] if isinstance(s, (list, tuple)) else s + return {'in_channel': s.channels} + + @property + def out_shape(self): + return [ShapeSpec(channels=self.out_channel, )] + + def forward(self, rois_feat): + rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) + fc6 = self.fc6(rois_feat) + fc6 = F.relu(fc6) + fc7 = self.fc7(fc6) + fc7 = F.relu(fc7) + return fc7 + + +@register +class XConvNormHead(nn.Layer): + __shared__ = ['norm_type', 'freeze_norm'] + """ + RCNN bbox head with serveral convolution layers + + Args: + in_channel (int): Input channels which 
can be derived by from_config + num_convs (int): The number of conv layers + conv_dim (int): The number of channels for the conv layers + out_channel (int): Output channels + resolution (int): Resolution of input feature map + norm_type (string): Norm type, bn, gn, sync_bn are available, + default `gn` + freeze_norm (bool): Whether to freeze the norm + stage_name (string): Prefix name for conv layer, '' by default + """ + + def __init__(self, + in_channel=256, + num_convs=4, + conv_dim=256, + out_channel=1024, + resolution=7, + norm_type='gn', + freeze_norm=False, + stage_name=''): + super(XConvNormHead, self).__init__() + self.in_channel = in_channel + self.num_convs = num_convs + self.conv_dim = conv_dim + self.out_channel = out_channel + self.norm_type = norm_type + self.freeze_norm = freeze_norm + + self.bbox_head_convs = [] + fan = conv_dim * 3 * 3 + initializer = KaimingNormal(fan_in=fan) + for i in range(self.num_convs): + in_c = in_channel if i == 0 else conv_dim + head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) + head_conv = self.add_sublayer( + head_conv_name, + ConvNormLayer( + ch_in=in_c, + ch_out=conv_dim, + filter_size=3, + stride=1, + norm_type=self.norm_type, + freeze_norm=self.freeze_norm, + initializer=initializer)) + self.bbox_head_convs.append(head_conv) + + fan = conv_dim * resolution * resolution + self.fc6 = nn.Linear( + conv_dim * resolution * resolution, + out_channel, + weight_attr=paddle.ParamAttr( + initializer=XavierUniform(fan_out=fan)), + bias_attr=paddle.ParamAttr( + learning_rate=2., regularizer=L2Decay(0.))) + + @classmethod + def from_config(cls, cfg, input_shape): + s = input_shape + s = s[0] if isinstance(s, (list, tuple)) else s + return {'in_channel': s.channels} + + @property + def out_shape(self): + return [ShapeSpec(channels=self.out_channel, )] + + def forward(self, rois_feat): + for i in range(self.num_convs): + rois_feat = F.relu(self.bbox_head_convs[i](rois_feat)) + rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) + fc6 = F.relu(self.fc6(rois_feat)) + return fc6 + + +@register +class BBoxHead(nn.Layer): + __shared__ = ['num_classes'] + __inject__ = ['bbox_assigner', 'bbox_loss'] + """ + RCNN bbox head + + Args: + head (nn.Layer): Extract feature in bbox head + in_channel (int): Input channel after RoI extractor + roi_extractor (object): The module of RoI Extractor + bbox_assigner (object): The module of Box Assigner, label and sample the + box. + with_pool (bool): Whether to use pooling for the RoI feature. 
+ num_classes (int): The number of classes + bbox_weight (List[float]): The weight to get the decode box + """ + + def __init__(self, + head, + in_channel, + roi_extractor=RoIAlign().__dict__, + bbox_assigner='BboxAssigner', + with_pool=False, + num_classes=80, + bbox_weight=[10., 10., 5., 5.], + bbox_loss=None): + super(BBoxHead, self).__init__() + self.head = head + self.roi_extractor = roi_extractor + if isinstance(roi_extractor, dict): + self.roi_extractor = RoIAlign(**roi_extractor) + self.bbox_assigner = bbox_assigner + + self.with_pool = with_pool + self.num_classes = num_classes + self.bbox_weight = bbox_weight + self.bbox_loss = bbox_loss + + self.bbox_score = nn.Linear( + in_channel, + self.num_classes + 1, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0.0, std=0.01))) + + self.bbox_delta = nn.Linear( + in_channel, + 4 * self.num_classes, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0.0, std=0.001))) + self.assigned_label = None + self.assigned_rois = None + + @classmethod + def from_config(cls, cfg, input_shape): + roi_pooler = cfg['roi_extractor'] + assert isinstance(roi_pooler, dict) + kwargs = RoIAlign.from_config(cfg, input_shape) + roi_pooler.update(kwargs) + kwargs = {'input_shape': input_shape} + head = create(cfg['head'], **kwargs) + return { + 'roi_extractor': roi_pooler, + 'head': head, + 'in_channel': head.out_shape[0].channels + } + + def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): + """ + body_feats (list[Tensor]): Feature maps from backbone + rois (list[Tensor]): RoIs generated from RPN module + rois_num (Tensor): The number of RoIs in each image + inputs (dict{Tensor}): The ground-truth of image + """ + if self.training: + rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) + self.assigned_rois = (rois, rois_num) + self.assigned_targets = targets + + rois_feat = self.roi_extractor(body_feats, rois, rois_num) + bbox_feat = self.head(rois_feat) + if self.with_pool: + feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1) + feat = paddle.squeeze(feat, axis=[2, 3]) + else: + feat = bbox_feat + scores = self.bbox_score(feat) + deltas = self.bbox_delta(feat) + + if self.training: + loss = self.get_loss(scores, deltas, targets, rois, + self.bbox_weight) + return loss, bbox_feat + else: + pred = self.get_prediction(scores, deltas) + return pred, self.head + + def get_loss(self, scores, deltas, targets, rois, bbox_weight): + """ + scores (Tensor): scores from bbox head outputs + deltas (Tensor): deltas from bbox head outputs + targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds + rois (List[Tensor]): RoIs generated in each batch + """ + # TODO: better pass args + tgt_labels, tgt_bboxes, tgt_gt_inds = targets + tgt_labels = paddle.concat(tgt_labels) if len( + tgt_labels) > 1 else tgt_labels[0] + tgt_labels = tgt_labels.cast('int64') + tgt_labels.stop_gradient = True + loss_bbox_cls = F.cross_entropy( + input=scores, label=tgt_labels, reduction='mean') + # bbox reg + + cls_agnostic_bbox_reg = deltas.shape[1] == 4 + + fg_inds = paddle.nonzero( + paddle.logical_and(tgt_labels >= 0, tgt_labels < + self.num_classes)).flatten() + + cls_name = 'loss_bbox_cls' + reg_name = 'loss_bbox_reg' + loss_bbox = {} + + loss_weight = 1. + if fg_inds.numel() == 0: + fg_inds = paddle.zeros([1], dtype='int32') + loss_weight = 0. 
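+ # deltas is [num_rois, 4] for class-agnostic regression (as in CascadeHead,
+ # which reuses this loss) and [num_rois, 4 * num_classes] otherwise; the
+ # class-specific branch below builds [num_fg * 4, 2] (row, col) indices so
+ # gather_nd picks out the 4 delta channels of each foreground RoI's gt class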
+ + if cls_agnostic_bbox_reg: + reg_delta = paddle.gather(deltas, fg_inds) + else: + fg_gt_classes = paddle.gather(tgt_labels, fg_inds) + + reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1) + reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1]) + + reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4) + + reg_col_inds = reg_col_inds.reshape([-1, 1]) + reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1) + + reg_delta = paddle.gather(deltas, fg_inds) + reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4]) + rois = paddle.concat(rois) if len(rois) > 1 else rois[0] + tgt_bboxes = paddle.concat(tgt_bboxes) if len( + tgt_bboxes) > 1 else tgt_bboxes[0] + + reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight) + reg_target = paddle.gather(reg_target, fg_inds) + reg_target.stop_gradient = True + + if self.bbox_loss is not None: + reg_delta = self.bbox_transform(reg_delta) + reg_target = self.bbox_transform(reg_target) + loss_bbox_reg = self.bbox_loss( + reg_delta, reg_target).sum() / tgt_labels.shape[0] + loss_bbox_reg *= self.num_classes + else: + loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( + ) / tgt_labels.shape[0] + + loss_bbox[cls_name] = loss_bbox_cls * loss_weight + loss_bbox[reg_name] = loss_bbox_reg * loss_weight + + return loss_bbox + + def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]): + wx, wy, ww, wh = weights + + deltas = paddle.reshape(deltas, shape=(0, -1, 4)) + + dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx + dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy + dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww + dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh + + dw = paddle.clip(dw, -1.e10, np.log(1000. / 16)) + dh = paddle.clip(dh, -1.e10, np.log(1000. / 16)) + + pred_ctr_x = dx + pred_ctr_y = dy + pred_w = paddle.exp(dw) + pred_h = paddle.exp(dh) + + x1 = pred_ctr_x - 0.5 * pred_w + y1 = pred_ctr_y - 0.5 * pred_h + x2 = pred_ctr_x + 0.5 * pred_w + y2 = pred_ctr_y + 0.5 * pred_h + + x1 = paddle.reshape(x1, shape=(-1, )) + y1 = paddle.reshape(y1, shape=(-1, )) + x2 = paddle.reshape(x2, shape=(-1, )) + y2 = paddle.reshape(y2, shape=(-1, )) + + return paddle.concat([x1, y1, x2, y2]) + + def get_prediction(self, score, delta): + bbox_prob = F.softmax(score) + return delta, bbox_prob + + def get_head(self, ): + return self.head + + def get_assigned_targets(self, ): + return self.assigned_targets + + def get_assigned_rois(self, ): + return self.assigned_rois diff --git a/ppdet/modeling/heads/cascade_head.py b/ppdet/modeling/heads/cascade_head.py new file mode 100644 index 0000000..aed5966 --- /dev/null +++ b/ppdet/modeling/heads/cascade_head.py @@ -0,0 +1,284 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
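CascadeHead below refines proposals stage by stage by decoding each stage's predicted deltas with `delta2bbox` (added in `ppdet/modeling/bbox_utils.py` above), using progressively larger `bbox_weight` values so later stages encode tighter corrections. The snippet is a minimal sketch of that encode/decode round trip, assuming `ppdet` is importable; the box coordinates are illustrative, not taken from this PR.

```python
import paddle
from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox, bbox_overlaps

weights = [10., 10., 5., 5.]                         # first-stage bbox_weight
proposal = paddle.to_tensor([[10., 10., 50., 50.]])  # (x1, y1, x2, y2)
gt = paddle.to_tensor([[12., 8., 56., 48.]])

deltas = bbox2delta(proposal, gt, weights)           # [1, 4] regression target
decoded = delta2bbox(deltas, proposal, weights)      # [1, 1, 4]
decoded = decoded.reshape([-1, 4])

# decoding the encoded target recovers the ground-truth box (IoU ~ 1.0)
print(bbox_overlaps(decoded, gt))
```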
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, XavierUniform +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register, create +from ppdet.modeling import ops + +from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead +from .roi_extractor import RoIAlign +from ..shape_spec import ShapeSpec +from ..bbox_utils import bbox2delta, delta2bbox, clip_bbox, nonempty_bbox + +__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] + + +@register +class CascadeTwoFCHead(nn.Layer): + __shared__ = ['num_cascade_stage'] + """ + Cascade RCNN bbox head with Two fc layers to extract feature + + Args: + in_channel (int): Input channel which can be derived by from_config + out_channel (int): Output channel + resolution (int): Resolution of input feature map, default 7 + num_cascade_stage (int): The number of cascade stage, default 3 + """ + + def __init__(self, + in_channel=256, + out_channel=1024, + resolution=7, + num_cascade_stage=3): + super(CascadeTwoFCHead, self).__init__() + + self.in_channel = in_channel + self.out_channel = out_channel + + self.head_list = [] + for stage in range(num_cascade_stage): + head_per_stage = self.add_sublayer( + str(stage), TwoFCHead(in_channel, out_channel, resolution)) + self.head_list.append(head_per_stage) + + @classmethod + def from_config(cls, cfg, input_shape): + s = input_shape + s = s[0] if isinstance(s, (list, tuple)) else s + return {'in_channel': s.channels} + + @property + def out_shape(self): + return [ShapeSpec(channels=self.out_channel, )] + + def forward(self, rois_feat, stage=0): + out = self.head_list[stage](rois_feat) + return out + + +@register +class CascadeXConvNormHead(nn.Layer): + __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage'] + """ + Cascade RCNN bbox head with serveral convolution layers + + Args: + in_channel (int): Input channels which can be derived by from_config + num_convs (int): The number of conv layers + conv_dim (int): The number of channels for the conv layers + out_channel (int): Output channels + resolution (int): Resolution of input feature map + norm_type (string): Norm type, bn, gn, sync_bn are available, + default `gn` + freeze_norm (bool): Whether to freeze the norm + num_cascade_stage (int): The number of cascade stage, default 3 + """ + + def __init__(self, + in_channel=256, + num_convs=4, + conv_dim=256, + out_channel=1024, + resolution=7, + norm_type='gn', + freeze_norm=False, + num_cascade_stage=3): + super(CascadeXConvNormHead, self).__init__() + self.in_channel = in_channel + self.out_channel = out_channel + + self.head_list = [] + for stage in range(num_cascade_stage): + head_per_stage = self.add_sublayer( + str(stage), + XConvNormHead( + in_channel, + num_convs, + conv_dim, + out_channel, + resolution, + norm_type, + freeze_norm, + stage_name='stage{}_'.format(stage))) + self.head_list.append(head_per_stage) + + @classmethod + def from_config(cls, cfg, input_shape): + s = input_shape + s = s[0] if isinstance(s, (list, tuple)) else s + return {'in_channel': s.channels} + + @property + def out_shape(self): + return [ShapeSpec(channels=self.out_channel, )] + + def forward(self, rois_feat, stage=0): + out = self.head_list[stage](rois_feat) + return out + + +@register +class CascadeHead(BBoxHead): + __shared__ = ['num_classes', 'num_cascade_stages'] + __inject__ = ['bbox_assigner', 'bbox_loss'] + """ + Cascade RCNN bbox head + + Args: + head (nn.Layer): Extract feature in bbox head + in_channel 
(int): Input channel after RoI extractor + roi_extractor (object): The module of RoI Extractor + bbox_assigner (object): The module of Box Assigner, label and sample the + box. + num_classes (int): The number of classes + bbox_weight (List[List[float]]): The weight to get the decode box and the + length of weight is the number of cascade stage + num_cascade_stages (int): THe number of stage to refine the box + """ + + def __init__(self, + head, + in_channel, + roi_extractor=RoIAlign().__dict__, + bbox_assigner='BboxAssigner', + num_classes=80, + bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], + [30.0, 30.0, 15.0, 15.0]], + num_cascade_stages=3, + bbox_loss=None): + nn.Layer.__init__(self, ) + self.head = head + self.roi_extractor = roi_extractor + if isinstance(roi_extractor, dict): + self.roi_extractor = RoIAlign(**roi_extractor) + self.bbox_assigner = bbox_assigner + + self.num_classes = num_classes + self.bbox_weight = bbox_weight + self.num_cascade_stages = num_cascade_stages + self.bbox_loss = bbox_loss + + self.bbox_score_list = [] + self.bbox_delta_list = [] + for i in range(num_cascade_stages): + score_name = 'bbox_score_stage{}'.format(i) + delta_name = 'bbox_delta_stage{}'.format(i) + bbox_score = self.add_sublayer( + score_name, + nn.Linear( + in_channel, + self.num_classes + 1, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0.0, std=0.01)))) + + bbox_delta = self.add_sublayer( + delta_name, + nn.Linear( + in_channel, + 4, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0.0, std=0.001)))) + self.bbox_score_list.append(bbox_score) + self.bbox_delta_list.append(bbox_delta) + self.assigned_label = None + self.assigned_rois = None + + def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): + """ + body_feats (list[Tensor]): Feature maps from backbone + rois (Tensor): RoIs generated from RPN module + rois_num (Tensor): The number of RoIs in each image + inputs (dict{Tensor}): The ground-truth of image + """ + targets = [] + if self.training: + rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) + targets_list = [targets] + self.assigned_rois = (rois, rois_num) + self.assigned_targets = targets + + pred_bbox = None + head_out_list = [] + for i in range(self.num_cascade_stages): + if i > 0: + rois, rois_num = self._get_rois_from_boxes(pred_bbox, + inputs['im_shape']) + if self.training: + rois, rois_num, targets = self.bbox_assigner( + rois, rois_num, inputs, i, is_cascade=True) + targets_list.append(targets) + + rois_feat = self.roi_extractor(body_feats, rois, rois_num) + bbox_feat = self.head(rois_feat, i) + scores = self.bbox_score_list[i](bbox_feat) + deltas = self.bbox_delta_list[i](bbox_feat) + head_out_list.append([scores, deltas, rois]) + pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i]) + + if self.training: + loss = {} + for stage, value in enumerate(zip(head_out_list, targets_list)): + (scores, deltas, rois), targets = value + loss_stage = self.get_loss(scores, deltas, targets, rois, + self.bbox_weight[stage]) + for k, v in loss_stage.items(): + loss[k + "_stage{}".format( + stage)] = v / self.num_cascade_stages + + return loss, bbox_feat + else: + scores, deltas, self.refined_rois = self.get_prediction( + head_out_list) + return (deltas, scores), self.head + + def _get_rois_from_boxes(self, boxes, im_shape): + rois = [] + for i, boxes_per_image in enumerate(boxes): + clip_box = clip_bbox(boxes_per_image, im_shape[i]) + if self.training: + keep = nonempty_bbox(clip_box) + if keep.shape[0] == 
0: + keep = paddle.zeros([1], dtype='int32') + clip_box = paddle.gather(clip_box, keep) + rois.append(clip_box) + rois_num = paddle.concat([paddle.shape(r)[0] for r in rois]) + return rois, rois_num + + def _get_pred_bbox(self, deltas, proposals, weights): + pred_proposals = paddle.concat(proposals) if len( + proposals) > 1 else proposals[0] + pred_bbox = delta2bbox(deltas, pred_proposals, weights) + pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]]) + num_prop = [p.shape[0] for p in proposals] + return pred_bbox.split(num_prop) + + def get_prediction(self, head_out_list): + """ + head_out_list(List[Tensor]): scores, deltas, rois + """ + pred_list = [] + scores_list = [F.softmax(head[0]) for head in head_out_list] + scores = paddle.add_n(scores_list) / self.num_cascade_stages + # Get deltas and rois from the last stage + _, deltas, rois = head_out_list[-1] + return scores, deltas, rois + + def get_refined_rois(self, ): + return self.refined_rois diff --git a/ppdet/modeling/heads/face_head.py b/ppdet/modeling/heads/face_head.py new file mode 100644 index 0000000..937f30d --- /dev/null +++ b/ppdet/modeling/heads/face_head.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..layers import AnchorGeneratorSSD + + +@register +class FaceHead(nn.Layer): + """ + Head block for Face detection network + + Args: + num_classes (int): Number of output classes. + in_channels (int): Number of input channels. + anchor_generator(object): instance of anchor genertor method. + kernel_size (int): kernel size of Conv2D in FaceHead. + padding (int): padding of Conv2D in FaceHead. + conv_decay (float): norm_decay (float): weight decay for conv layer weights. + loss (object): loss of face detection model. 
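+ Note: a background class is appended internally, so the head predicts
+ num_classes + 1 scores per prior; each input feature map gets one box
+ conv (num_priors * 4 channels) and one score conv (num_priors *
+ (num_classes + 1) channels)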
+ """ + __shared__ = ['num_classes'] + __inject__ = ['anchor_generator', 'loss'] + + def __init__(self, + num_classes=80, + in_channels=(96, 96), + anchor_generator=AnchorGeneratorSSD().__dict__, + kernel_size=3, + padding=1, + conv_decay=0., + loss='SSDLoss'): + super(FaceHead, self).__init__() + # add background class + self.num_classes = num_classes + 1 + self.in_channels = in_channels + self.anchor_generator = anchor_generator + self.loss = loss + + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) + + self.num_priors = self.anchor_generator.num_priors + self.box_convs = [] + self.score_convs = [] + for i, num_prior in enumerate(self.num_priors): + box_conv_name = "boxes{}".format(i) + box_conv = self.add_sublayer( + box_conv_name, + nn.Conv2D( + in_channels=in_channels[i], + out_channels=num_prior * 4, + kernel_size=kernel_size, + padding=padding)) + self.box_convs.append(box_conv) + + score_conv_name = "scores{}".format(i) + score_conv = self.add_sublayer( + score_conv_name, + nn.Conv2D( + in_channels=in_channels[i], + out_channels=num_prior * self.num_classes, + kernel_size=kernel_size, + padding=padding)) + self.score_convs.append(score_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def forward(self, feats, image, gt_bbox=None, gt_class=None): + box_preds = [] + cls_scores = [] + prior_boxes = [] + for feat, box_conv, score_conv in zip(feats, self.box_convs, + self.score_convs): + box_pred = box_conv(feat) + box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) + box_pred = paddle.reshape(box_pred, [0, -1, 4]) + box_preds.append(box_pred) + + cls_score = score_conv(feat) + cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) + cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) + cls_scores.append(cls_score) + + prior_boxes = self.anchor_generator(feats, image) + + if self.training: + return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, + prior_boxes) + else: + return (box_preds, cls_scores), prior_boxes + + def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): + return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/ppdet/modeling/heads/fcos_head.py b/ppdet/modeling/heads/fcos_head.py new file mode 100644 index 0000000..3b8fd7f --- /dev/null +++ b/ppdet/modeling/heads/fcos_head.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Normal, Constant + +from ppdet.core.workspace import register +from ppdet.modeling.layers import ConvNormLayer + + +class ScaleReg(nn.Layer): + """ + Parameter for scaling the regression outputs. 
+ """ + + def __init__(self): + super(ScaleReg, self).__init__() + self.scale_reg = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.)), + dtype="float32") + + def forward(self, inputs): + out = inputs * self.scale_reg + return out + + +@register +class FCOSFeat(nn.Layer): + """ + FCOSFeat of FCOS + + Args: + feat_in (int): The channel number of input Tensor. + feat_out (int): The channel number of output Tensor. + num_convs (int): The convolution number of the FCOSFeat. + norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. + use_dcn (bool): Whether to use dcn in tower or not. + """ + + def __init__(self, + feat_in=256, + feat_out=256, + num_convs=4, + norm_type='bn', + use_dcn=False): + super(FCOSFeat, self).__init__() + self.num_convs = num_convs + self.norm_type = norm_type + self.cls_subnet_convs = [] + self.reg_subnet_convs = [] + for i in range(self.num_convs): + in_c = feat_in if i == 0 else feat_out + + cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i) + cls_conv = self.add_sublayer( + cls_conv_name, + ConvNormLayer( + ch_in=in_c, + ch_out=feat_out, + filter_size=3, + stride=1, + norm_type=norm_type, + use_dcn=use_dcn, + bias_on=True, + lr_scale=2.)) + self.cls_subnet_convs.append(cls_conv) + + reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i) + reg_conv = self.add_sublayer( + reg_conv_name, + ConvNormLayer( + ch_in=in_c, + ch_out=feat_out, + filter_size=3, + stride=1, + norm_type=norm_type, + use_dcn=use_dcn, + bias_on=True, + lr_scale=2.)) + self.reg_subnet_convs.append(reg_conv) + + def forward(self, fpn_feat): + cls_feat = fpn_feat + reg_feat = fpn_feat + for i in range(self.num_convs): + cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat)) + reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat)) + return cls_feat, reg_feat + + +@register +class FCOSHead(nn.Layer): + """ + FCOSHead + Args: + fcos_feat (object): Instance of 'FCOSFeat' + num_classes (int): Number of classes + fpn_stride (list): The stride of each FPN Layer + prior_prob (float): Used to set the bias init for the class prediction layer + fcos_loss (object): Instance of 'FCOSLoss' + norm_reg_targets (bool): Normalization the regression target if true + centerness_on_reg (bool): The prediction of centerness on regression or clssification branch + """ + __inject__ = ['fcos_feat', 'fcos_loss'] + __shared__ = ['num_classes'] + + def __init__(self, + fcos_feat, + num_classes=80, + fpn_stride=[8, 16, 32, 64, 128], + prior_prob=0.01, + fcos_loss='FCOSLoss', + norm_reg_targets=True, + centerness_on_reg=True): + super(FCOSHead, self).__init__() + self.fcos_feat = fcos_feat + self.num_classes = num_classes + self.fpn_stride = fpn_stride + self.prior_prob = prior_prob + self.fcos_loss = fcos_loss + self.norm_reg_targets = norm_reg_targets + self.centerness_on_reg = centerness_on_reg + + conv_cls_name = "fcos_head_cls" + bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) + self.fcos_head_cls = self.add_sublayer( + conv_cls_name, + nn.Conv2D( + in_channels=256, + out_channels=self.num_classes, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name=conv_cls_name + "_weights", + initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr( + name=conv_cls_name + "_bias", + initializer=Constant(value=bias_init_value)))) + + conv_reg_name = "fcos_head_reg" + self.fcos_head_reg = self.add_sublayer( + conv_reg_name, + nn.Conv2D( + in_channels=256, + out_channels=4, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr( + 
name=conv_reg_name + "_weights", + initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr( + name=conv_reg_name + "_bias", + initializer=Constant(value=0)))) + + conv_centerness_name = "fcos_head_centerness" + self.fcos_head_centerness = self.add_sublayer( + conv_centerness_name, + nn.Conv2D( + in_channels=256, + out_channels=1, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name=conv_centerness_name + "_weights", + initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr( + name=conv_centerness_name + "_bias", + initializer=Constant(value=0)))) + + self.scales_regs = [] + for i in range(len(self.fpn_stride)): + lvl = int(math.log(int(self.fpn_stride[i]), 2)) + feat_name = 'p{}_feat'.format(lvl) + scale_reg = self.add_sublayer(feat_name, ScaleReg()) + self.scales_regs.append(scale_reg) + + def _compute_locations_by_level(self, fpn_stride, feature): + """ + Compute locations of anchor points of each FPN layer + Args: + fpn_stride (int): The stride of current FPN feature map + feature (Tensor): Tensor of current FPN feature map + Return: + Anchor points locations of current FPN feature map + """ + shape_fm = paddle.shape(feature) + shape_fm.stop_gradient = True + h, w = shape_fm[2], shape_fm[3] + shift_x = paddle.arange(0, w * fpn_stride, fpn_stride) + shift_y = paddle.arange(0, h * fpn_stride, fpn_stride) + shift_x = paddle.unsqueeze(shift_x, axis=0) + shift_y = paddle.unsqueeze(shift_y, axis=1) + shift_x = paddle.expand(shift_x, shape=[h, w]) + shift_y = paddle.expand(shift_y, shape=[h, w]) + shift_x.stop_gradient = True + shift_y.stop_gradient = True + shift_x = paddle.reshape(shift_x, shape=[-1]) + shift_y = paddle.reshape(shift_y, shape=[-1]) + location = paddle.stack( + [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2 + location.stop_gradient = True + return location + + def forward(self, fpn_feats, is_training): + assert len(fpn_feats) == len( + self.fpn_stride + ), "The size of fpn_feats is not equal to size of fpn_stride" + cls_logits_list = [] + bboxes_reg_list = [] + centerness_list = [] + for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, + self.fpn_stride, fpn_feats): + fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) + cls_logits = self.fcos_head_cls(fcos_cls_feat) + bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) + if self.centerness_on_reg: + centerness = self.fcos_head_centerness(fcos_reg_feat) + else: + centerness = self.fcos_head_centerness(fcos_cls_feat) + if self.norm_reg_targets: + bbox_reg = F.relu(bbox_reg) + if not is_training: + bbox_reg = bbox_reg * fpn_stride + else: + bbox_reg = paddle.exp(bbox_reg) + cls_logits_list.append(cls_logits) + bboxes_reg_list.append(bbox_reg) + centerness_list.append(centerness) + + if not is_training: + locations_list = [] + for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): + location = self._compute_locations_by_level(fpn_stride, feature) + locations_list.append(location) + + return locations_list, cls_logits_list, bboxes_reg_list, centerness_list + else: + return cls_logits_list, bboxes_reg_list, centerness_list + + def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness): + cls_logits, bboxes_reg, centerness = fcos_head_outs + return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, + tag_bboxes, tag_centerness) diff --git a/ppdet/modeling/heads/mask_head.py b/ppdet/modeling/heads/mask_head.py new file mode 100644 index 0000000..e5df8d2 --- /dev/null +++ b/ppdet/modeling/heads/mask_head.py @@ -0,0 +1,250 @@ +# Copyright (c) 
2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register, create +from ppdet.modeling import ops +from ppdet.modeling.layers import ConvNormLayer + +from .roi_extractor import RoIAlign + + +@register +class MaskFeat(nn.Layer): + """ + Feature extraction in Mask head + + Args: + in_channel (int): Input channels + out_channel (int): Output channels + num_convs (int): The number of conv layers, default 4 + norm_type (string | None): Norm type, bn, gn, sync_bn are available, + default None + """ + + def __init__(self, + in_channel=256, + out_channel=256, + num_convs=4, + norm_type=None): + super(MaskFeat, self).__init__() + self.num_convs = num_convs + self.in_channel = in_channel + self.out_channel = out_channel + self.norm_type = norm_type + fan_conv = out_channel * 3 * 3 + fan_deconv = out_channel * 2 * 2 + + mask_conv = nn.Sequential() + if norm_type == 'gn': + for i in range(self.num_convs): + conv_name = 'mask_inter_feat_{}'.format(i + 1) + mask_conv.add_sublayer( + conv_name, + ConvNormLayer( + ch_in=in_channel if i == 0 else out_channel, + ch_out=out_channel, + filter_size=3, + stride=1, + norm_type=self.norm_type, + initializer=KaimingNormal(fan_in=fan_conv))) + mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) + else: + for i in range(self.num_convs): + conv_name = 'mask_inter_feat_{}'.format(i + 1) + conv = nn.Conv2D( + in_channels=in_channel if i == 0 else out_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=paddle.ParamAttr( + initializer=KaimingNormal(fan_in=fan_conv))) + mask_conv.add_sublayer(conv_name, conv) + mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) + mask_conv.add_sublayer( + 'conv5_mask', + nn.Conv2DTranspose( + in_channels=self.in_channel, + out_channels=self.out_channel, + kernel_size=2, + stride=2, + weight_attr=paddle.ParamAttr( + initializer=KaimingNormal(fan_in=fan_deconv)))) + mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU()) + self.upsample = mask_conv + + @classmethod + def from_config(cls, cfg, input_shape): + if isinstance(input_shape, (list, tuple)): + input_shape = input_shape[0] + return {'in_channel': input_shape.channels, } + + def out_channels(self): + return self.out_channel + + def forward(self, feats): + return self.upsample(feats) + + +@register +class MaskHead(nn.Layer): + __shared__ = ['num_classes'] + __inject__ = ['mask_assigner'] + """ + RCNN mask head + + Args: + head (nn.Layer): Extract feature in mask head + roi_extractor (object): The module of RoI Extractor + mask_assigner (object): The module of Mask Assigner, + label and sample the mask + num_classes (int): The number of classes + share_bbox_feat (bool): Whether to share the feature from bbox head, + default false + """ + + def __init__(self, + head, + roi_extractor=RoIAlign().__dict__, + 
mask_assigner='MaskAssigner', + num_classes=80, + share_bbox_feat=False): + super(MaskHead, self).__init__() + self.num_classes = num_classes + + self.roi_extractor = roi_extractor + if isinstance(roi_extractor, dict): + self.roi_extractor = RoIAlign(**roi_extractor) + self.head = head + self.in_channels = head.out_channels() + self.mask_assigner = mask_assigner + self.share_bbox_feat = share_bbox_feat + self.bbox_head = None + + self.mask_fcn_logits = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.num_classes, + kernel_size=1, + weight_attr=paddle.ParamAttr(initializer=KaimingNormal( + fan_in=self.num_classes))) + + @classmethod + def from_config(cls, cfg, input_shape): + roi_pooler = cfg['roi_extractor'] + assert isinstance(roi_pooler, dict) + kwargs = RoIAlign.from_config(cfg, input_shape) + roi_pooler.update(kwargs) + kwargs = {'input_shape': input_shape} + head = create(cfg['head'], **kwargs) + return { + 'roi_extractor': roi_pooler, + 'head': head, + } + + def get_loss(self, mask_logits, mask_label, mask_target, mask_weight): + mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3]) + mask_label = paddle.expand_as(mask_label, mask_logits) + mask_label.stop_gradient = True + mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) + shape = mask_logits.shape + mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]]) + + mask_target = mask_target.cast('float32') + mask_weight = mask_weight.unsqueeze([1, 2]) + loss_mask = F.binary_cross_entropy_with_logits( + mask_pred, mask_target, weight=mask_weight, reduction="mean") + return loss_mask + + def forward_train(self, body_feats, rois, rois_num, inputs, targets, + bbox_feat): + """ + body_feats (list[Tensor]): Multi-level backbone features + rois (list[Tensor]): Proposals for each batch with shape [N, 4] + rois_num (Tensor): The number of proposals for each batch + inputs (dict): ground truth info + """ + tgt_labels, _, tgt_gt_inds = targets + rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( + rois, tgt_labels, tgt_gt_inds, inputs) + + if self.share_bbox_feat: + rois_feat = paddle.gather(bbox_feat, mask_index) + else: + rois_feat = self.roi_extractor(body_feats, rois, rois_num) + mask_feat = self.head(rois_feat) + mask_logits = self.mask_fcn_logits(mask_feat) + + loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks, + tgt_weights) + return {'loss_mask': loss_mask} + + def forward_test(self, + body_feats, + rois, + rois_num, + scale_factor, + feat_func=None): + """ + body_feats (list[Tensor]): Multi-level backbone features + rois (Tensor): Prediction from bbox head with shape [N, 6] + rois_num (Tensor): The number of prediction for each batch + scale_factor (Tensor): The scale factor from origin size to input size + """ + if rois.shape[0] == 0: + mask_out = paddle.full([1, 1, 1, 1], -1) + else: + bbox = [rois[:, 2:]] + labels = rois[:, 0].cast('int32') + rois_feat = self.roi_extractor(body_feats, bbox, rois_num) + if self.share_bbox_feat: + assert feat_func is not None + rois_feat = feat_func(rois_feat) + + mask_feat = self.head(rois_feat) + mask_logit = self.mask_fcn_logits(mask_feat) + mask_num_class = mask_logit.shape[1] + if mask_num_class == 1: + mask_out = F.sigmoid(mask_logit) + else: + num_masks = mask_logit.shape[0] + mask_out = [] + # TODO: need to optimize gather + for i in range(mask_logit.shape[0]): + pred_masks = paddle.unsqueeze( + mask_logit[i, :, :, :], axis=0) + mask = paddle.gather(pred_masks, labels[i], axis=1) + 
mask_out.append(mask) + mask_out = F.sigmoid(paddle.concat(mask_out)) + return mask_out + + def forward(self, + body_feats, + rois, + rois_num, + inputs, + targets=None, + bbox_feat=None, + feat_func=None): + if self.training: + return self.forward_train(body_feats, rois, rois_num, inputs, + targets, bbox_feat) + else: + im_scale = inputs['scale_factor'] + return self.forward_test(body_feats, rois, rois_num, im_scale, + feat_func) diff --git a/ppdet/modeling/heads/roi_extractor.py b/ppdet/modeling/heads/roi_extractor.py new file mode 100644 index 0000000..35c3924 --- /dev/null +++ b/ppdet/modeling/heads/roi_extractor.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from ppdet.core.workspace import register +from ppdet.modeling import ops + + +def _to_list(v): + if not isinstance(v, (list, tuple)): + return [v] + return v + + +@register +class RoIAlign(object): + """ + RoI Align module + + For more details, please refer to the document of roi_align in + in ppdet/modeing/ops.py + + Args: + resolution (int): The output size, default 14 + spatial_scale (float): Multiplicative spatial scale factor to translate + ROI coords from their input scale to the scale used when pooling. + default 0.0625 + sampling_ratio (int): The number of sampling points in the interpolation + grid, default 0 + canconical_level (int): The referring level of FPN layer with + specified level. default 4 + canonical_size (int): The referring scale of FPN layer with + specified scale. default 224 + start_level (int): The start level of FPN layer to extract RoI feature, + default 0 + end_level (int): The end level of FPN layer to extract RoI feature, + default 3 + aligned (bool): Whether to add offset to rois' coord in roi_align. + default false + """ + + def __init__(self, + resolution=14, + spatial_scale=0.0625, + sampling_ratio=0, + canconical_level=4, + canonical_size=224, + start_level=0, + end_level=3, + aligned=False): + super(RoIAlign, self).__init__() + self.resolution = resolution + self.spatial_scale = _to_list(spatial_scale) + self.sampling_ratio = sampling_ratio + self.canconical_level = canconical_level + self.canonical_size = canonical_size + self.start_level = start_level + self.end_level = end_level + self.aligned = aligned + + @classmethod + def from_config(cls, cfg, input_shape): + return {'spatial_scale': [1. 
/ i.stride for i in input_shape]} + + def __call__(self, feats, roi, rois_num): + roi = paddle.concat(roi) if len(roi) > 1 else roi[0] + if len(feats) == 1: + rois_feat = ops.roi_align( + feats[self.start_level], + roi, + self.resolution, + self.spatial_scale[0], + rois_num=rois_num, + aligned=self.aligned) + else: + offset = 2 + k_min = self.start_level + offset + k_max = self.end_level + offset + rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals( + roi, + k_min, + k_max, + self.canconical_level, + self.canonical_size, + rois_num=rois_num) + rois_feat_list = [] + for lvl in range(self.start_level, self.end_level + 1): + roi_feat = ops.roi_align( + feats[lvl], + rois_dist[lvl], + self.resolution, + self.spatial_scale[lvl], + sampling_ratio=self.sampling_ratio, + rois_num=rois_num_dist[lvl], + aligned=self.aligned) + rois_feat_list.append(roi_feat) + rois_feat_shuffle = paddle.concat(rois_feat_list) + rois_feat = paddle.gather(rois_feat_shuffle, restore_index) + + return rois_feat diff --git a/ppdet/modeling/heads/s2anet_head.py b/ppdet/modeling/heads/s2anet_head.py new file mode 100644 index 0000000..12e0c31 --- /dev/null +++ b/ppdet/modeling/heads/s2anet_head.py @@ -0,0 +1,872 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
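A note on the multi-level `RoIAlign` extractor above: when more than one FPN feature is passed in, `ops.distribute_fpn_proposals` first routes every proposal to a single pyramid level, `ops.roi_align` then pools each level separately, and `restore_index` puts the pooled features back into the original proposal order. The routing is presumably the standard FPN heuristic driven by `canconical_level` (default 4) and `canonical_size` (default 224); a minimal NumPy sketch of that rule, assuming the usual FPN-paper formula and the extractor defaults (`start_level=0`, `end_level=3`, offset 2, i.e. levels 2..5), is:

```
import numpy as np

def assign_fpn_level(rois, k0=4, canonical_size=224., k_min=2, k_max=5):
    """Map each RoI (x1, y1, x2, y2) to an FPN level, FPN-paper style."""
    w = rois[:, 2] - rois[:, 0]
    h = rois[:, 3] - rois[:, 1]
    scale = np.sqrt(np.clip(w * h, 1e-6, None))
    # k = floor(k0 + log2(scale / canonical_size)), clipped to the valid levels
    level = np.floor(k0 + np.log2(scale / canonical_size + 1e-6))
    return np.clip(level, k_min, k_max).astype('int64')

# A 224x224 proposal stays on level 4 (P4); a 112x112 one drops to P3.
print(assign_fpn_level(np.array([[0., 0., 224., 224.], [0., 0., 112., 112.]])))
```

The exact epsilon and rounding inside the Paddle op may differ; the sketch only illustrates why larger proposals are pooled from coarser FPN levels.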
+import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant +from ppdet.core.workspace import register +from ppdet.modeling import ops +from ppdet.modeling import bbox_utils +from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner +import numpy as np + + +class S2ANetAnchorGenerator(object): + """ + S2ANetAnchorGenerator by np + """ + + def __init__(self, + base_size=8, + scales=1.0, + ratios=1.0, + scale_major=True, + ctr=None): + self.base_size = base_size + self.scales = scales + self.ratios = ratios + self.scale_major = scale_major + self.ctr = ctr + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + return self.base_anchors.shape[0] + + def gen_base_anchors(self): + w = self.base_size + h = self.base_size + if self.ctr is None: + x_ctr = 0.5 * (w - 1) + y_ctr = 0.5 * (h - 1) + else: + x_ctr, y_ctr = self.ctr + + h_ratios = np.sqrt(self.ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:] * self.scales[:]).reshape([-1]) + hs = (h * h_ratios[:] * self.scales[:]).reshape([-1]) + else: + ws = (w * self.scales[:] * w_ratios[:]).reshape([-1]) + hs = (h * self.scales[:] * h_ratios[:]).reshape([-1]) + + # yapf: disable + base_anchors = np.stack( + [ + x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) + ], + axis=-1) + base_anchors = np.round(base_anchors) + # yapf: enable + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + xx, yy = np.meshgrid(x, y) + xx = xx.reshape(-1) + yy = yy.reshape(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_anchors(self, featmap_size, stride=16): + # featmap_size*stride project it to original area + base_anchors = self.base_anchors + feat_h, feat_w = featmap_size + shift_x = np.arange(0, feat_w, 1, 'int32') * stride + shift_y = np.arange(0, feat_h, 1, 'int32') * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = np.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) + # shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + #all_anchors = base_anchors[:, :] + shifts[:, :] + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + # all_anchors = all_anchors.reshape([-1, 4]) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... 
+ return all_anchors + + def valid_flags(self, featmap_size, valid_size): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = np.zeros([feat_w], dtype='uint8') + valid_y = np.zeros([feat_h], dtype='uint8') + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid.reshape([-1]) + + # valid = valid[:, None].expand( + # [valid.size(0), self.num_base_anchors]).reshape([-1]) + return valid + + +class AlignConv(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): + super(AlignConv, self).__init__() + self.kernel_size = kernel_size + self.align_conv = paddle.vision.ops.DeformConv2D( + in_channels, + out_channels, + kernel_size=self.kernel_size, + padding=(self.kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=None) + + @paddle.no_grad() + def get_offset(self, anchors, featmap_size, stride): + """ + Args: + anchors: [M,5] xc,yc,w,h,angle + featmap_size: (feat_h, feat_w) + stride: 8 + Returns: + + """ + anchors = paddle.reshape(anchors, [-1, 5]) # (NA,5) + dtype = anchors.dtype + feat_h, feat_w = featmap_size + pad = (self.kernel_size - 1) // 2 + idx = paddle.arange(-pad, pad + 1, dtype=dtype) + + yy, xx = paddle.meshgrid(idx, idx) + xx = paddle.reshape(xx, [-1]) + yy = paddle.reshape(yy, [-1]) + + # get sampling locations of default conv + xc = paddle.arange(0, feat_w, dtype=dtype) + yc = paddle.arange(0, feat_h, dtype=dtype) + yc, xc = paddle.meshgrid(yc, xc) + + xc = paddle.reshape(xc, [-1, 1]) + yc = paddle.reshape(yc, [-1, 1]) + x_conv = xc + xx + y_conv = yc + yy + + # get sampling locations of anchors + # x_ctr, y_ctr, w, h, a = np.unbind(anchors, dim=1) + x_ctr = anchors[:, 0] + y_ctr = anchors[:, 1] + w = anchors[:, 2] + h = anchors[:, 3] + a = anchors[:, 4] + + x_ctr = paddle.reshape(x_ctr, [x_ctr.shape[0], 1]) + y_ctr = paddle.reshape(y_ctr, [y_ctr.shape[0], 1]) + w = paddle.reshape(w, [w.shape[0], 1]) + h = paddle.reshape(h, [h.shape[0], 1]) + a = paddle.reshape(a, [a.shape[0], 1]) + + x_ctr = x_ctr / stride + y_ctr = y_ctr / stride + w_s = w / stride + h_s = h / stride + cos, sin = paddle.cos(a), paddle.sin(a) + dw, dh = w_s / self.kernel_size, h_s / self.kernel_size + x, y = dw * xx, dh * yy + xr = cos * x - sin * y + yr = sin * x + cos * y + x_anchor, y_anchor = xr + x_ctr, yr + y_ctr + # get offset filed + offset_x = x_anchor - x_conv + offset_y = y_anchor - y_conv + # x, y in anchors is opposite in image coordinates, + # so we stack them with y, x other than x, y + offset = paddle.stack([offset_y, offset_x], axis=-1) + # NA,ks*ks*2 + # [NA, ks, ks, 2] --> [NA, ks*ks*2] + offset = paddle.reshape(offset, [offset.shape[0], -1]) + # [NA, ks*ks*2] --> [ks*ks*2, NA] + offset = paddle.transpose(offset, [1, 0]) + # [NA, ks*ks*2] --> [1, ks*ks*2, H, W] + offset = paddle.reshape(offset, [1, -1, feat_h, feat_w]) + return offset + + def forward(self, x, refine_anchors, stride): + featmap_size = (x.shape[2], x.shape[3]) + offset = self.get_offset(refine_anchors, featmap_size, stride) + x = F.relu(self.align_conv(x, offset)) + return x + + +@register +class S2ANetHead(nn.Layer): + """ + S2Anet head + Args: + stacked_convs (int): number of stacked_convs + feat_in (int): input channels of feat + feat_out (int): output channels of feat + num_classes (int): num_classes + anchor_strides (list): stride of anchors + anchor_scales (list): scale of anchors 
+ anchor_ratios (list): ratios of anchors + target_means (list): target_means + target_stds (list): target_stds + align_conv_type (str): align_conv_type ['Conv', 'AlignConv'] + align_conv_size (int): kernel size of align_conv + use_sigmoid_cls (bool): use sigmoid_cls or not + reg_loss_weight (list): loss weight for regression + """ + __shared__ = ['num_classes'] + __inject__ = ['anchor_assign'] + + def __init__(self, + stacked_convs=2, + feat_in=256, + feat_out=256, + num_classes=15, + anchor_strides=[8, 16, 32, 64, 128], + anchor_scales=[4], + anchor_ratios=[1.0], + target_means=(.0, .0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0, 1.0), + align_conv_type='AlignConv', + align_conv_size=3, + use_sigmoid_cls=True, + anchor_assign=RBoxAssigner().__dict__, + reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.0]): + super(S2ANetHead, self).__init__() + self.stacked_convs = stacked_convs + self.feat_in = feat_in + self.feat_out = feat_out + self.anchor_list = None + self.anchor_scales = anchor_scales + self.anchor_ratios = anchor_ratios + self.anchor_strides = anchor_strides + self.anchor_base_sizes = list(anchor_strides) + self.target_means = target_means + self.target_stds = target_stds + assert align_conv_type in ['AlignConv', 'Conv', 'DCN'] + self.align_conv_type = align_conv_type + self.align_conv_size = align_conv_size + + self.use_sigmoid_cls = use_sigmoid_cls + self.cls_out_channels = num_classes if self.use_sigmoid_cls else 1 + self.sampling = False + self.anchor_assign = anchor_assign + self.reg_loss_weight = reg_loss_weight + + self.s2anet_head_out = None + + # anchor + self.anchor_generators = [] + for anchor_base in self.anchor_base_sizes: + self.anchor_generators.append( + S2ANetAnchorGenerator(anchor_base, anchor_scales, + anchor_ratios)) + + self.fam_cls_convs = nn.Sequential() + self.fam_reg_convs = nn.Sequential() + + for i in range(self.stacked_convs): + chan_in = self.feat_in if i == 0 else self.feat_out + + self.fam_cls_convs.add_sublayer( + 'fam_cls_conv_{}'.format(i), + nn.Conv2D( + in_channels=chan_in, + out_channels=self.feat_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0)))) + + self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i), + nn.ReLU()) + + self.fam_reg_convs.add_sublayer( + 'fam_reg_conv_{}'.format(i), + nn.Conv2D( + in_channels=chan_in, + out_channels=self.feat_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0)))) + + self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i), + nn.ReLU()) + + self.fam_reg = nn.Conv2D( + self.feat_out, + 5, + 1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0))) + prior_prob = 0.01 + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + self.fam_cls = nn.Conv2D( + self.feat_out, + self.cls_out_channels, + 1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(bias_init))) + + if self.align_conv_type == "AlignConv": + self.align_conv = AlignConv(self.feat_out, self.feat_out, + self.align_conv_size) + elif self.align_conv_type == "Conv": + self.align_conv = nn.Conv2D( + self.feat_out, + self.feat_out, + self.align_conv_size, + padding=(self.align_conv_size - 1) // 2, + bias_attr=ParamAttr(initializer=Constant(0))) + + elif self.align_conv_type == "DCN": + self.align_conv_offset = nn.Conv2D( + self.feat_out, + 2 * self.align_conv_size**2, + 1, + 
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0))) + + self.align_conv = paddle.vision.ops.DeformConv2D( + self.feat_out, + self.feat_out, + self.align_conv_size, + padding=(self.align_conv_size - 1) // 2, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=False) + + self.or_conv = nn.Conv2D( + self.feat_out, + self.feat_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0))) + + # ODM + self.odm_cls_convs = nn.Sequential() + self.odm_reg_convs = nn.Sequential() + + for i in range(self.stacked_convs): + ch_in = self.feat_out + # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out + + self.odm_cls_convs.add_sublayer( + 'odm_cls_conv_{}'.format(i), + nn.Conv2D( + in_channels=ch_in, + out_channels=self.feat_out, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0)))) + + self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i), + nn.ReLU()) + + self.odm_reg_convs.add_sublayer( + 'odm_reg_conv_{}'.format(i), + nn.Conv2D( + in_channels=self.feat_out, + out_channels=self.feat_out, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0)))) + + self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i), + nn.ReLU()) + + self.odm_cls = nn.Conv2D( + self.feat_out, + self.cls_out_channels, + 3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(bias_init))) + self.odm_reg = nn.Conv2D( + self.feat_out, + 5, + 3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), + bias_attr=ParamAttr(initializer=Constant(0))) + + self.base_anchors = dict() + self.featmap_sizes = dict() + self.base_anchors = dict() + self.refine_anchor_list = [] + + def forward(self, feats): + fam_reg_branch_list = [] + fam_cls_branch_list = [] + + odm_reg_branch_list = [] + odm_cls_branch_list = [] + + self.featmap_sizes = dict() + self.base_anchors = dict() + self.refine_anchor_list = [] + + for i, feat in enumerate(feats): + fam_cls_feat = self.fam_cls_convs(feat) + + fam_cls = self.fam_cls(fam_cls_feat) + # [N, CLS, H, W] --> [N, H, W, CLS] + fam_cls = fam_cls.transpose([0, 2, 3, 1]) + fam_cls_reshape = paddle.reshape( + fam_cls, [fam_cls.shape[0], -1, self.cls_out_channels]) + fam_cls_branch_list.append(fam_cls_reshape) + + fam_reg_feat = self.fam_reg_convs(feat) + + fam_reg = self.fam_reg(fam_reg_feat) + # [N, 5, H, W] --> [N, H, W, 5] + fam_reg = fam_reg.transpose([0, 2, 3, 1]) + fam_reg_reshape = paddle.reshape(fam_reg, [fam_reg.shape[0], -1, 5]) + fam_reg_branch_list.append(fam_reg_reshape) + + # prepare anchor + featmap_size = feat.shape[-2:] + self.featmap_sizes[i] = featmap_size + init_anchors = self.anchor_generators[i].grid_anchors( + featmap_size, self.anchor_strides[i]) + + init_anchors = bbox_utils.rect2rbox(init_anchors) + self.base_anchors[(i, featmap_size[0])] = init_anchors + + #fam_reg1 = fam_reg + #fam_reg1.stop_gradient = True + refine_anchor = bbox_utils.bbox_decode( + fam_reg.detach(), init_anchors, self.target_means, + self.target_stds) + + self.refine_anchor_list.append(refine_anchor) + + if self.align_conv_type == 'AlignConv': + align_feat = self.align_conv(feat, + refine_anchor.clone(), + self.anchor_strides[i]) + elif self.align_conv_type == 'DCN': + align_offset = 
self.align_conv_offset(feat) + align_feat = self.align_conv(feat, align_offset) + elif self.align_conv_type == 'Conv': + align_feat = self.align_conv(feat) + + or_feat = self.or_conv(align_feat) + odm_reg_feat = or_feat + odm_cls_feat = or_feat + + odm_reg_feat = self.odm_reg_convs(odm_reg_feat) + odm_cls_feat = self.odm_cls_convs(odm_cls_feat) + + odm_cls_score = self.odm_cls(odm_cls_feat) + # [N, CLS, H, W] --> [N, H, W, CLS] + odm_cls_score = odm_cls_score.transpose([0, 2, 3, 1]) + odm_cls_score_reshape = paddle.reshape( + odm_cls_score, + [odm_cls_score.shape[0], -1, self.cls_out_channels]) + + odm_cls_branch_list.append(odm_cls_score_reshape) + + odm_bbox_pred = self.odm_reg(odm_reg_feat) + # [N, 5, H, W] --> [N, H, W, 5] + odm_bbox_pred = odm_bbox_pred.transpose([0, 2, 3, 1]) + odm_bbox_pred_reshape = paddle.reshape( + odm_bbox_pred, [odm_bbox_pred.shape[0], -1, 5]) + odm_reg_branch_list.append(odm_bbox_pred_reshape) + + self.s2anet_head_out = (fam_cls_branch_list, fam_reg_branch_list, + odm_cls_branch_list, odm_reg_branch_list) + return self.s2anet_head_out + + def get_prediction(self, nms_pre): + refine_anchors = self.refine_anchor_list + fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list = self.s2anet_head_out + pred_scores, pred_bboxes = self.get_bboxes( + odm_cls_branch_list, + odm_reg_branch_list, + refine_anchors, + nms_pre, + cls_out_channels=self.cls_out_channels, + use_sigmoid_cls=self.use_sigmoid_cls) + return pred_scores, pred_bboxes + + def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): + """ + Args: + pred: pred score + label: label + delta: delta + Returns: loss + """ + assert pred.shape == label.shape and label.numel() > 0 + assert delta > 0 + diff = paddle.abs(pred - label) + loss = paddle.where(diff < delta, 0.5 * diff * diff / delta, + diff - 0.5 * delta) + return loss + + def get_fam_loss(self, fam_target, s2anet_head_out): + (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) = fam_target + fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list = s2anet_head_out + + fam_cls_losses = [] + fam_bbox_losses = [] + st_idx = 0 + featmap_sizes = [self.featmap_sizes[e] for e in self.featmap_sizes] + num_total_samples = len(pos_inds) + len( + neg_inds) if self.sampling else len(pos_inds) + num_total_samples = max(1, num_total_samples) + + for idx, feat_size in enumerate(featmap_sizes): + feat_anchor_num = feat_size[0] * feat_size[1] + + # step1: get data + feat_labels = labels[st_idx:st_idx + feat_anchor_num] + feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] + + feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] + feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] + st_idx += feat_anchor_num + + # step2: calc cls loss + feat_labels = feat_labels.reshape(-1) + feat_label_weights = feat_label_weights.reshape(-1) + + fam_cls_score = fam_cls_branch_list[idx] + fam_cls_score = paddle.squeeze(fam_cls_score, axis=0) + fam_cls_score1 = fam_cls_score + + # gt_classes 0~14(data), feat_labels 0~14, sigmoid_focal_loss need class>=1 + feat_labels = paddle.to_tensor(feat_labels) + feat_labels_one_hot = paddle.nn.functional.one_hot( + feat_labels, self.cls_out_channels + 1) + feat_labels_one_hot = feat_labels_one_hot[:, 1:] + feat_labels_one_hot.stop_gradient = True + + num_total_samples = paddle.to_tensor( + num_total_samples, dtype='float32', stop_gradient=True) + + fam_cls = F.sigmoid_focal_loss( + fam_cls_score1, + feat_labels_one_hot, + 
normalizer=num_total_samples, + reduction='none') + + feat_label_weights = feat_label_weights.reshape( + feat_label_weights.shape[0], 1) + feat_label_weights = np.repeat( + feat_label_weights, self.cls_out_channels, axis=1) + feat_label_weights = paddle.to_tensor( + feat_label_weights, stop_gradient=True) + + fam_cls = fam_cls * feat_label_weights + fam_cls_total = paddle.sum(fam_cls) + fam_cls_losses.append(fam_cls_total) + + # step3: regression loss + fam_bbox_pred = fam_reg_branch_list[idx] + feat_bbox_targets = paddle.to_tensor( + feat_bbox_targets, dtype='float32', stop_gradient=True) + feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) + + fam_bbox_pred = fam_reg_branch_list[idx] + fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0) + fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5]) + fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets) + loss_weight = paddle.to_tensor( + self.reg_loss_weight, dtype='float32', stop_gradient=True) + fam_bbox = paddle.multiply(fam_bbox, loss_weight) + feat_bbox_weights = paddle.to_tensor( + feat_bbox_weights, stop_gradient=True) + fam_bbox = fam_bbox * feat_bbox_weights + fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples + + fam_bbox_losses.append(fam_bbox_total) + + fam_cls_loss = paddle.add_n(fam_cls_losses) + fam_cls_loss = fam_cls_loss * 2.0 + fam_reg_loss = paddle.add_n(fam_bbox_losses) + return fam_cls_loss, fam_reg_loss + + def get_odm_loss(self, odm_target, s2anet_head_out): + (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) = odm_target + fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list = s2anet_head_out + + odm_cls_losses = [] + odm_bbox_losses = [] + st_idx = 0 + featmap_sizes = [self.featmap_sizes[e] for e in self.featmap_sizes] + num_total_samples = len(pos_inds) + len( + neg_inds) if self.sampling else len(pos_inds) + num_total_samples = max(1, num_total_samples) + for idx, feat_size in enumerate(featmap_sizes): + feat_anchor_num = feat_size[0] * feat_size[1] + + # step1: get data + feat_labels = labels[st_idx:st_idx + feat_anchor_num] + feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] + + feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] + feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] + st_idx += feat_anchor_num + + # step2: calc cls loss + feat_labels = feat_labels.reshape(-1) + feat_label_weights = feat_label_weights.reshape(-1) + + odm_cls_score = odm_cls_branch_list[idx] + odm_cls_score = paddle.squeeze(odm_cls_score, axis=0) + odm_cls_score1 = odm_cls_score + + # gt_classes 0~14(data), feat_labels 0~14, sigmoid_focal_loss need class>=1 + feat_labels = paddle.to_tensor(feat_labels) + feat_labels_one_hot = paddle.nn.functional.one_hot( + feat_labels, self.cls_out_channels + 1) + feat_labels_one_hot = feat_labels_one_hot[:, 1:] + feat_labels_one_hot.stop_gradient = True + + num_total_samples = paddle.to_tensor( + num_total_samples, dtype='float32', stop_gradient=True) + odm_cls = F.sigmoid_focal_loss( + odm_cls_score1, + feat_labels_one_hot, + normalizer=num_total_samples, + reduction='none') + + feat_label_weights = feat_label_weights.reshape( + feat_label_weights.shape[0], 1) + feat_label_weights = np.repeat( + feat_label_weights, self.cls_out_channels, axis=1) + feat_label_weights = paddle.to_tensor(feat_label_weights) + feat_label_weights.stop_gradient = True + + odm_cls = odm_cls * feat_label_weights + odm_cls_total = paddle.sum(odm_cls) + 
odm_cls_losses.append(odm_cls_total) + + # # step3: regression loss + feat_bbox_targets = paddle.to_tensor( + feat_bbox_targets, dtype='float32') + feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) + feat_bbox_targets.stop_gradient = True + + odm_bbox_pred = odm_reg_branch_list[idx] + odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0) + odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5]) + odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets) + loss_weight = paddle.to_tensor( + self.reg_loss_weight, dtype='float32', stop_gradient=True) + odm_bbox = paddle.multiply(odm_bbox, loss_weight) + feat_bbox_weights = paddle.to_tensor( + feat_bbox_weights, stop_gradient=True) + odm_bbox = odm_bbox * feat_bbox_weights + odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples + odm_bbox_losses.append(odm_bbox_total) + + odm_cls_loss = paddle.add_n(odm_cls_losses) + odm_cls_loss = odm_cls_loss * 2.0 + odm_reg_loss = paddle.add_n(odm_bbox_losses) + return odm_cls_loss, odm_reg_loss + + def get_loss(self, inputs): + # inputs: im_id image im_shape scale_factor gt_bbox gt_class is_crowd + + # compute loss + fam_cls_loss_lst = [] + fam_reg_loss_lst = [] + odm_cls_loss_lst = [] + odm_reg_loss_lst = [] + + im_shape = inputs['im_shape'] + for im_id in range(im_shape.shape[0]): + np_im_shape = inputs['im_shape'][im_id].numpy() + np_scale_factor = inputs['scale_factor'][im_id].numpy() + # data_format: (xc, yc, w, h, theta) + gt_bboxes = inputs['gt_rbox'][im_id].numpy() + gt_labels = inputs['gt_class'][im_id].numpy() + is_crowd = inputs['is_crowd'][im_id].numpy() + gt_labels = gt_labels + 1 + + # featmap_sizes + featmap_sizes = [self.featmap_sizes[e] for e in self.featmap_sizes] + anchors_list, valid_flag_list = self.get_init_anchors(featmap_sizes, + np_im_shape) + anchors_list_all = [] + for ii, anchor in enumerate(anchors_list): + anchor = anchor.reshape(-1, 4) + anchor = bbox_utils.rect2rbox(anchor) + anchors_list_all.extend(anchor) + anchors_list_all = np.array(anchors_list_all) + + # get im_feat + fam_cls_feats_list = [e[im_id] for e in self.s2anet_head_out[0]] + fam_reg_feats_list = [e[im_id] for e in self.s2anet_head_out[1]] + odm_cls_feats_list = [e[im_id] for e in self.s2anet_head_out[2]] + odm_reg_feats_list = [e[im_id] for e in self.s2anet_head_out[3]] + im_s2anet_head_out = (fam_cls_feats_list, fam_reg_feats_list, + odm_cls_feats_list, odm_reg_feats_list) + + # FAM + im_fam_target = self.anchor_assign(anchors_list_all, gt_bboxes, + gt_labels, is_crowd) + if im_fam_target is not None: + im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( + im_fam_target, im_s2anet_head_out) + fam_cls_loss_lst.append(im_fam_cls_loss) + fam_reg_loss_lst.append(im_fam_reg_loss) + + # ODM + refine_anchors_list, valid_flag_list = self.get_refine_anchors( + featmap_sizes, image_shape=np_im_shape) + refine_anchors_list = np.array(refine_anchors_list) + im_odm_target = self.anchor_assign(refine_anchors_list, gt_bboxes, + gt_labels, is_crowd) + + if im_odm_target is not None: + im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss( + im_odm_target, im_s2anet_head_out) + odm_cls_loss_lst.append(im_odm_cls_loss) + odm_reg_loss_lst.append(im_odm_reg_loss) + fam_cls_loss = paddle.add_n(fam_cls_loss_lst) + fam_reg_loss = paddle.add_n(fam_reg_loss_lst) + odm_cls_loss = paddle.add_n(odm_cls_loss_lst) + odm_reg_loss = paddle.add_n(odm_reg_loss_lst) + return { + 'fam_cls_loss': fam_cls_loss, + 'fam_reg_loss': fam_reg_loss, + 'odm_cls_loss': odm_cls_loss, + 'odm_reg_loss': odm_reg_loss + } + + def 
get_init_anchors(self, featmap_sizes, image_shape): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + image_shape (list[dict]): Image meta info. + Returns: + tuple: anchors of each image, valid flags of each image + """ + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + anchor_list = [] + for i in range(num_levels): + anchors = self.anchor_generators[i].grid_anchors( + featmap_sizes[i], self.anchor_strides[i]) + anchor_list.append(anchors) + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for i in range(num_levels): + anchor_stride = self.anchor_strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = image_shape + valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w) + flags = self.anchor_generators[i].valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w)) + valid_flag_list.append(flags) + + return anchor_list, valid_flag_list + + def get_refine_anchors(self, featmap_sizes, image_shape): + num_levels = len(featmap_sizes) + + refine_anchors_list = [] + for i in range(num_levels): + refine_anchor = self.refine_anchor_list[i] + refine_anchor = paddle.squeeze(refine_anchor, axis=0) + refine_anchor = refine_anchor.numpy() + refine_anchor = np.reshape(refine_anchor, + [-1, refine_anchor.shape[-1]]) + refine_anchors_list.extend(refine_anchor) + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for i in range(num_levels): + anchor_stride = self.anchor_strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = image_shape + valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w) + flags = self.anchor_generators[i].valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w)) + valid_flag_list.append(flags) + + return refine_anchors_list, valid_flag_list + + def get_bboxes(self, cls_score_list, bbox_pred_list, mlvl_anchors, nms_pre, + cls_out_channels, use_sigmoid_cls): + assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors) + + mlvl_bboxes = [] + mlvl_scores = [] + + idx = 0 + for cls_score, bbox_pred, anchors in zip(cls_score_list, bbox_pred_list, + mlvl_anchors): + cls_score = paddle.reshape(cls_score, [-1, cls_out_channels]) + if use_sigmoid_cls: + scores = F.sigmoid(cls_score) + else: + scores = F.softmax(cls_score, axis=-1) + + # bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 5) + bbox_pred = paddle.transpose(bbox_pred, [1, 2, 0]) + bbox_pred = paddle.reshape(bbox_pred, [-1, 5]) + anchors = paddle.reshape(anchors, [-1, 5]) + + if nms_pre > 0 and scores.shape[0] > nms_pre: + # Get maximum scores for foreground classes. 
+ if use_sigmoid_cls: + max_scores = paddle.max(scores, axis=1) + else: + max_scores = paddle.max(scores[:, 1:], axis=1) + + topk_val, topk_inds = paddle.topk(max_scores, nms_pre) + anchors = paddle.gather(anchors, topk_inds) + bbox_pred = paddle.gather(bbox_pred, topk_inds) + scores = paddle.gather(scores, topk_inds) + + target_means = (.0, .0, .0, .0, .0) + target_stds = (1.0, 1.0, 1.0, 1.0, 1.0) + bboxes = bbox_utils.delta2rbox(anchors, bbox_pred, target_means, + target_stds) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + + idx += 1 + + mlvl_bboxes = paddle.concat(mlvl_bboxes, axis=0) + mlvl_scores = paddle.concat(mlvl_scores) + if use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = paddle.zeros( + [mlvl_scores.shape[0], 1], dtype=mlvl_scores.dtype) + mlvl_scores = paddle.concat([padding, mlvl_scores], axis=1) + + return mlvl_scores, mlvl_bboxes diff --git a/ppdet/modeling/heads/solov2_head.py b/ppdet/modeling/heads/solov2_head.py new file mode 100644 index 0000000..5f15461 --- /dev/null +++ b/ppdet/modeling/heads/solov2_head.py @@ -0,0 +1,535 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant + +from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS +from ppdet.core.workspace import register + +from six.moves import zip +import numpy as np + +__all__ = ['SOLOv2Head'] + + +@register +class SOLOv2MaskHead(nn.Layer): + """ + MaskHead of SOLOv2 + + Args: + in_channels (int): The channel number of input Tensor. + out_channels (int): The channel number of output Tensor. + start_level (int): The position where the input starts. + end_level (int): The position where the input ends. + use_dcn_in_tower (bool): Whether to use dcn in tower or not. 
+ """ + + def __init__(self, + in_channels=256, + mid_channels=128, + out_channels=256, + start_level=0, + end_level=3, + use_dcn_in_tower=False): + super(SOLOv2MaskHead, self).__init__() + assert start_level >= 0 and end_level >= start_level + self.in_channels = in_channels + self.out_channels = out_channels + self.mid_channels = mid_channels + self.use_dcn_in_tower = use_dcn_in_tower + self.range_level = end_level - start_level + 1 + # TODO: add DeformConvNorm + conv_type = [ConvNormLayer] + self.conv_func = conv_type[0] + if self.use_dcn_in_tower: + self.conv_func = conv_type[1] + self.convs_all_levels = [] + for i in range(start_level, end_level + 1): + conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i) + conv_pre_feat = nn.Sequential() + if i == start_level: + conv_pre_feat.add_sublayer( + conv_feat_name + '.conv' + str(i), + self.conv_func( + ch_in=self.in_channels, + ch_out=self.mid_channels, + filter_size=3, + stride=1, + norm_type='gn')) + self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) + self.convs_all_levels.append(conv_pre_feat) + else: + for j in range(i): + ch_in = 0 + if j == 0: + ch_in = self.in_channels + 2 if i == end_level else self.in_channels + else: + ch_in = self.mid_channels + conv_pre_feat.add_sublayer( + conv_feat_name + '.conv' + str(j), + self.conv_func( + ch_in=ch_in, + ch_out=self.mid_channels, + filter_size=3, + stride=1, + norm_type='gn')) + conv_pre_feat.add_sublayer( + conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU()) + conv_pre_feat.add_sublayer( + 'upsample' + str(i) + str(j), + nn.Upsample( + scale_factor=2, mode='bilinear')) + self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) + self.convs_all_levels.append(conv_pre_feat) + + conv_pred_name = 'mask_feat_head.conv_pred.0' + self.conv_pred = self.add_sublayer( + conv_pred_name, + self.conv_func( + ch_in=self.mid_channels, + ch_out=self.out_channels, + filter_size=1, + stride=1, + norm_type='gn')) + + def forward(self, inputs): + """ + Get SOLOv2MaskHead output. + + Args: + inputs(list[Tensor]): feature map from each necks with shape of [N, C, H, W] + Returns: + ins_pred(Tensor): Output of SOLOv2MaskHead head + """ + feat_all_level = F.relu(self.convs_all_levels[0](inputs[0])) + for i in range(1, self.range_level): + input_p = inputs[i] + if i == (self.range_level - 1): + input_feat = input_p + x_range = paddle.linspace( + -1, 1, paddle.shape(input_feat)[-1], dtype='float32') + y_range = paddle.linspace( + -1, 1, paddle.shape(input_feat)[-2], dtype='float32') + y, x = paddle.meshgrid([y_range, x_range]) + x = paddle.unsqueeze(x, [0, 1]) + y = paddle.unsqueeze(y, [0, 1]) + y = paddle.expand( + y, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) + x = paddle.expand( + x, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) + coord_feat = paddle.concat([x, y], axis=1) + input_p = paddle.concat([input_p, coord_feat], axis=1) + feat_all_level = paddle.add(feat_all_level, + self.convs_all_levels[i](input_p)) + ins_pred = F.relu(self.conv_pred(feat_all_level)) + + return ins_pred + + +@register +class SOLOv2Head(nn.Layer): + """ + Head block for SOLOv2 network + + Args: + num_classes (int): Number of output classes. + in_channels (int): Number of input channels. + seg_feat_channels (int): Num_filters of kernel & categroy branch convolution operation. + stacked_convs (int): Times of convolution operation. + num_grids (list[int]): List of feature map grids size. + kernel_out_channels (int): Number of output channels in kernel branch. 
+ dcn_v2_stages (list): Which stage use dcn v2 in tower. It is between [0, stacked_convs). + segm_strides (list[int]): List of segmentation area stride. + solov2_loss (object): SOLOv2Loss instance. + score_threshold (float): Threshold of categroy score. + mask_nms (object): MaskMatrixNMS instance. + """ + __inject__ = ['solov2_loss', 'mask_nms'] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + in_channels=256, + seg_feat_channels=256, + stacked_convs=4, + num_grids=[40, 36, 24, 16, 12], + kernel_out_channels=256, + dcn_v2_stages=[], + segm_strides=[8, 8, 16, 32, 32], + solov2_loss=None, + score_threshold=0.1, + mask_threshold=0.5, + mask_nms=None): + super(SOLOv2Head, self).__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.seg_num_grids = num_grids + self.cate_out_channels = self.num_classes + self.seg_feat_channels = seg_feat_channels + self.stacked_convs = stacked_convs + self.kernel_out_channels = kernel_out_channels + self.dcn_v2_stages = dcn_v2_stages + self.segm_strides = segm_strides + self.solov2_loss = solov2_loss + self.mask_nms = mask_nms + self.score_threshold = score_threshold + self.mask_threshold = mask_threshold + + conv_type = [ConvNormLayer] + self.conv_func = conv_type[0] + self.kernel_pred_convs = [] + self.cate_pred_convs = [] + for i in range(self.stacked_convs): + if i in self.dcn_v2_stages: + self.conv_func = conv_type[1] + ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels + kernel_conv = self.add_sublayer( + 'bbox_head.kernel_convs.' + str(i), + self.conv_func( + ch_in=ch_in, + ch_out=self.seg_feat_channels, + filter_size=3, + stride=1, + norm_type='gn')) + self.kernel_pred_convs.append(kernel_conv) + ch_in = self.in_channels if i == 0 else self.seg_feat_channels + cate_conv = self.add_sublayer( + 'bbox_head.cate_convs.' 
+ str(i), + self.conv_func( + ch_in=ch_in, + ch_out=self.seg_feat_channels, + filter_size=3, + stride=1, + norm_type='gn')) + self.cate_pred_convs.append(cate_conv) + + self.solo_kernel = self.add_sublayer( + 'bbox_head.solo_kernel', + nn.Conv2D( + self.seg_feat_channels, + self.kernel_out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + bias_attr=True)) + self.solo_cate = self.add_sublayer( + 'bbox_head.solo_cate', + nn.Conv2D( + self.seg_feat_channels, + self.cate_out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Normal( + mean=0., std=0.01)), + bias_attr=ParamAttr(initializer=Constant( + value=float(-np.log((1 - 0.01) / 0.01)))))) + + def _points_nms(self, heat, kernel_size=2): + hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1) + keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32') + return heat * keep + + def _split_feats(self, feats): + return (F.interpolate( + feats[0], + scale_factor=0.5, + align_corners=False, + align_mode=0, + mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate( + feats[4], + size=paddle.shape(feats[3])[-2:], + mode='bilinear', + align_corners=False, + align_mode=0)) + + def forward(self, input): + """ + Get SOLOv2 head output + + Args: + input (list): List of Tensors, output of backbone or neck stages + Returns: + cate_pred_list (list): Tensors of each category branch layer + kernel_pred_list (list): Tensors of each kernel branch layer + """ + feats = self._split_feats(input) + cate_pred_list = [] + kernel_pred_list = [] + for idx in range(len(self.seg_num_grids)): + cate_pred, kernel_pred = self._get_output_single(feats[idx], idx) + cate_pred_list.append(cate_pred) + kernel_pred_list.append(kernel_pred) + + return cate_pred_list, kernel_pred_list + + def _get_output_single(self, input, idx): + ins_kernel_feat = input + # CoordConv + x_range = paddle.linspace( + -1, 1, paddle.shape(ins_kernel_feat)[-1], dtype='float32') + y_range = paddle.linspace( + -1, 1, paddle.shape(ins_kernel_feat)[-2], dtype='float32') + y, x = paddle.meshgrid([y_range, x_range]) + x = paddle.unsqueeze(x, [0, 1]) + y = paddle.unsqueeze(y, [0, 1]) + y = paddle.expand( + y, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) + x = paddle.expand( + x, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) + coord_feat = paddle.concat([x, y], axis=1) + ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1) + + # kernel branch + kernel_feat = ins_kernel_feat + seg_num_grid = self.seg_num_grids[idx] + kernel_feat = F.interpolate( + kernel_feat, + size=[seg_num_grid, seg_num_grid], + mode='bilinear', + align_corners=False, + align_mode=0) + cate_feat = kernel_feat[:, :-2, :, :] + + for kernel_layer in self.kernel_pred_convs: + kernel_feat = F.relu(kernel_layer(kernel_feat)) + kernel_pred = self.solo_kernel(kernel_feat) + # cate branch + for cate_layer in self.cate_pred_convs: + cate_feat = F.relu(cate_layer(cate_feat)) + cate_pred = self.solo_cate(cate_feat) + + if not self.training: + cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2) + cate_pred = paddle.transpose(cate_pred, [0, 2, 3, 1]) + return cate_pred, kernel_pred + + def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels, + cate_labels, grid_order_list, fg_num): + """ + Get loss of network of SOLOv2. + + Args: + cate_preds (list): Tensor list of categroy branch output. + kernel_preds (list): Tensor list of kernel branch output. 
+ ins_pred (list): Tensor list of instance branch output. + ins_labels (list): List of instance labels pre batch. + cate_labels (list): List of categroy labels pre batch. + grid_order_list (list): List of index in pre grid. + fg_num (int): Number of positive samples in a mini-batch. + Returns: + loss_ins (Tensor): The instance loss Tensor of SOLOv2 network. + loss_cate (Tensor): The category loss Tensor of SOLOv2 network. + """ + batch_size = paddle.shape(grid_order_list[0])[0] + ins_pred_list = [] + for kernel_preds_level, grid_orders_level in zip(kernel_preds, + grid_order_list): + if grid_orders_level.shape[1] == 0: + ins_pred_list.append(None) + continue + grid_orders_level = paddle.reshape(grid_orders_level, [-1]) + reshape_pred = paddle.reshape( + kernel_preds_level, + shape=(paddle.shape(kernel_preds_level)[0], + paddle.shape(kernel_preds_level)[1], -1)) + reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1]) + reshape_pred = paddle.reshape( + reshape_pred, shape=(-1, paddle.shape(reshape_pred)[2])) + gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level) + gathered_pred = paddle.reshape( + gathered_pred, + shape=[batch_size, -1, paddle.shape(gathered_pred)[1]]) + cur_ins_pred = ins_pred + cur_ins_pred = paddle.reshape( + cur_ins_pred, + shape=(paddle.shape(cur_ins_pred)[0], + paddle.shape(cur_ins_pred)[1], -1)) + ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred) + cur_ins_pred = paddle.reshape( + ins_pred_conv, + shape=(-1, paddle.shape(ins_pred)[-2], + paddle.shape(ins_pred)[-1])) + ins_pred_list.append(cur_ins_pred) + + num_ins = paddle.sum(fg_num) + cate_preds = [ + paddle.reshape( + paddle.transpose(cate_pred, [0, 2, 3, 1]), + shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds + ] + flatten_cate_preds = paddle.concat(cate_preds) + new_cate_labels = [] + for cate_label in cate_labels: + new_cate_labels.append(paddle.reshape(cate_label, shape=[-1])) + cate_labels = paddle.concat(new_cate_labels) + + loss_ins, loss_cate = self.solov2_loss( + ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins) + + return {'loss_ins': loss_ins, 'loss_cate': loss_cate} + + def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape, + scale_factor): + """ + Get prediction result of SOLOv2 network + + Args: + cate_preds (list): List of Variables, output of categroy branch. + kernel_preds (list): List of Variables, output of kernel branch. + seg_pred (list): List of Variables, output of mask head stages. + im_shape (Variables): [h, w] for input images. + scale_factor (Variables): [scale, scale] for input images. + Returns: + seg_masks (Tensor): The prediction segmentation. + cate_labels (Tensor): The prediction categroy label of each segmentation. + seg_masks (Tensor): The prediction score of each segmentation. 
+ """ + num_levels = len(cate_preds) + featmap_size = paddle.shape(seg_pred)[-2:] + seg_masks_list = [] + cate_labels_list = [] + cate_scores_list = [] + cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] + kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] + # Currently only supports batch size == 1 + for idx in range(1): + cate_pred_list = [ + paddle.reshape( + cate_preds[i][idx], shape=(-1, self.cate_out_channels)) + for i in range(num_levels) + ] + seg_pred_list = seg_pred + kernel_pred_list = [ + paddle.reshape( + paddle.transpose(kernel_preds[i][idx], [1, 2, 0]), + shape=(-1, self.kernel_out_channels)) + for i in range(num_levels) + ] + cate_pred_list = paddle.concat(cate_pred_list, axis=0) + kernel_pred_list = paddle.concat(kernel_pred_list, axis=0) + + seg_masks, cate_labels, cate_scores = self.get_seg_single( + cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, + im_shape[idx], scale_factor[idx][0]) + bbox_num = paddle.shape(cate_labels)[0] + return seg_masks, cate_labels, cate_scores, bbox_num + + def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, + im_shape, scale_factor): + h = paddle.cast(im_shape[0], 'int32')[0] + w = paddle.cast(im_shape[1], 'int32')[0] + upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4] + + y = paddle.zeros(shape=paddle.shape(cate_preds), dtype='float32') + inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y) + inds = paddle.nonzero(inds) + if paddle.shape(inds)[0] == 0: + out = paddle.full(shape=[1], fill_value=-1) + return out, out, out + cate_preds = paddle.reshape(cate_preds, shape=[-1]) + # Prevent empty and increase fake data + ind_a = paddle.cast(paddle.shape(kernel_preds)[0], 'int64') + ind_b = paddle.zeros(shape=[1], dtype='int64') + inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0) + inds = paddle.concat([inds, inds_end]) + kernel_preds_end = paddle.ones( + shape=[1, self.kernel_out_channels], dtype='float32') + kernel_preds = paddle.concat([kernel_preds, kernel_preds_end]) + cate_preds = paddle.concat( + [cate_preds, paddle.zeros( + shape=[1], dtype='float32')]) + + # cate_labels & kernel_preds + cate_labels = inds[:, 1] + kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0]) + cate_score_idx = paddle.add(inds[:, 0] * 80, cate_labels) + cate_scores = paddle.gather(cate_preds, index=cate_score_idx) + + size_trans = np.power(self.seg_num_grids, 2) + strides = [] + for _ind in range(len(self.segm_strides)): + strides.append( + paddle.full( + shape=[int(size_trans[_ind])], + fill_value=self.segm_strides[_ind], + dtype="int32")) + strides = paddle.concat(strides) + strides = paddle.gather(strides, index=inds[:, 0]) + + # mask encoding. 
+ kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3]) + seg_preds = F.conv2d(seg_preds, kernel_preds) + seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0])) + seg_masks = seg_preds > self.mask_threshold + seg_masks = paddle.cast(seg_masks, 'float32') + sum_masks = paddle.sum(seg_masks, axis=[1, 2]) + + y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32') + keep = paddle.where(sum_masks > strides, sum_masks, y) + keep = paddle.nonzero(keep) + keep = paddle.squeeze(keep, axis=[1]) + # Prevent empty and increase fake data + keep_other = paddle.concat( + [keep, paddle.cast(paddle.shape(sum_masks)[0] - 1, 'int64')]) + keep_scores = paddle.concat( + [keep, paddle.cast(paddle.shape(sum_masks)[0], 'int64')]) + cate_scores_end = paddle.zeros(shape=[1], dtype='float32') + cate_scores = paddle.concat([cate_scores, cate_scores_end]) + + seg_masks = paddle.gather(seg_masks, index=keep_other) + seg_preds = paddle.gather(seg_preds, index=keep_other) + sum_masks = paddle.gather(sum_masks, index=keep_other) + cate_labels = paddle.gather(cate_labels, index=keep_other) + cate_scores = paddle.gather(cate_scores, index=keep_scores) + + # mask scoring. + seg_mul = paddle.cast(seg_preds * seg_masks, 'float32') + seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks + cate_scores *= seg_scores + # Matrix NMS + seg_preds, cate_scores, cate_labels = self.mask_nms( + seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) + ori_shape = im_shape[:2] / scale_factor + 0.5 + ori_shape = paddle.cast(ori_shape, 'int32') + seg_preds = F.interpolate( + paddle.unsqueeze(seg_preds, 0), + size=upsampled_size_out, + mode='bilinear', + align_corners=False, + align_mode=0) + seg_preds = paddle.slice( + seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w]) + seg_masks = paddle.squeeze( + F.interpolate( + seg_preds, + size=ori_shape[:2], + mode='bilinear', + align_corners=False, + align_mode=0), + axis=[0]) + # TODO: support bool type + seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'int32') + return seg_masks, cate_labels, cate_scores diff --git a/ppdet/modeling/heads/ssd_head.py b/ppdet/modeling/heads/ssd_head.py new file mode 100644 index 0000000..96ed5e4 --- /dev/null +++ b/ppdet/modeling/heads/ssd_head.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..layers import AnchorGeneratorSSD + + +class SepConvLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + padding=1, + conv_decay=0): + super(SepConvLayer, self).__init__() + self.dw_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + groups=in_channels, + weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), + bias_attr=False) + + self.bn = nn.BatchNorm2D( + in_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.)), + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + + self.pw_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), + bias_attr=False) + + def forward(self, x): + x = self.dw_conv(x) + x = F.relu6(self.bn(x)) + x = self.pw_conv(x) + return x + + +@register +class SSDHead(nn.Layer): + """ + SSDHead + + Args: + num_classes (int): Number of classes + in_channels (list): Number of channels per input feature + anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance + kernel_size (int): Conv kernel size + padding (int): Conv padding + use_sepconv (bool): Use SepConvLayer if true + conv_decay (float): Conv regularization coeff + loss (object): 'SSDLoss' instance + """ + + __shared__ = ['num_classes'] + __inject__ = ['anchor_generator', 'loss'] + + def __init__(self, + num_classes=80, + in_channels=(512, 1024, 512, 256, 256, 256), + anchor_generator=AnchorGeneratorSSD().__dict__, + kernel_size=3, + padding=1, + use_sepconv=False, + conv_decay=0., + loss='SSDLoss'): + super(SSDHead, self).__init__() + # add background class + self.num_classes = num_classes + 1 + self.in_channels = in_channels + self.anchor_generator = anchor_generator + self.loss = loss + + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) + + self.num_priors = self.anchor_generator.num_priors + self.box_convs = [] + self.score_convs = [] + for i, num_prior in enumerate(self.num_priors): + box_conv_name = "boxes{}".format(i) + if not use_sepconv: + box_conv = self.add_sublayer( + box_conv_name, + nn.Conv2D( + in_channels=in_channels[i], + out_channels=num_prior * 4, + kernel_size=kernel_size, + padding=padding)) + else: + box_conv = self.add_sublayer( + box_conv_name, + SepConvLayer( + in_channels=in_channels[i], + out_channels=num_prior * 4, + kernel_size=kernel_size, + padding=padding, + conv_decay=conv_decay)) + self.box_convs.append(box_conv) + + score_conv_name = "scores{}".format(i) + if not use_sepconv: + score_conv = self.add_sublayer( + score_conv_name, + nn.Conv2D( + in_channels=in_channels[i], + out_channels=num_prior * self.num_classes, + kernel_size=kernel_size, + padding=padding)) + else: + score_conv = self.add_sublayer( + score_conv_name, + SepConvLayer( + in_channels=in_channels[i], + out_channels=num_prior * self.num_classes, + kernel_size=kernel_size, + padding=padding, + conv_decay=conv_decay)) + self.score_convs.append(score_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + def forward(self, feats, image, gt_bbox=None, gt_class=None): + box_preds = [] + cls_scores = [] + prior_boxes = [] + for feat, box_conv, score_conv in 
zip(feats, self.box_convs, + self.score_convs): + box_pred = box_conv(feat) + box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) + box_pred = paddle.reshape(box_pred, [0, -1, 4]) + box_preds.append(box_pred) + + cls_score = score_conv(feat) + cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) + cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) + cls_scores.append(cls_score) + + prior_boxes = self.anchor_generator(feats, image) + + if self.training: + return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, + prior_boxes) + else: + return (box_preds, cls_scores), prior_boxes + + def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): + return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/ppdet/modeling/heads/ttf_head.py b/ppdet/modeling/heads/ttf_head.py new file mode 100644 index 0000000..9e2eb6a --- /dev/null +++ b/ppdet/modeling/heads/ttf_head.py @@ -0,0 +1,284 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, Uniform, Normal +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register +from ppdet.modeling.layers import DeformableConvV2, LiteConv +import numpy as np + + +@register +class HMHead(nn.Layer): + """ + Args: + ch_in (int): The channel number of input Tensor. + ch_out (int): The channel number of output Tensor. + num_classes (int): Number of classes. + conv_num (int): The convolution number of hm_feat. + dcn_head(bool): whether use dcn in head. False by default. + lite_head(bool): whether use lite version. False by default. + norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. + bn by default + + Return: + Heatmap head output + """ + __shared__ = ['num_classes', 'norm_type'] + + def __init__( + self, + ch_in, + ch_out=128, + num_classes=80, + conv_num=2, + dcn_head=False, + lite_head=False, + norm_type='bn', ): + super(HMHead, self).__init__() + head_conv = nn.Sequential() + for i in range(conv_num): + name = 'conv.{}'.format(i) + if lite_head: + lite_name = 'hm.' 
+ name + head_conv.add_sublayer( + lite_name, + LiteConv( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + norm_type=norm_type)) + head_conv.add_sublayer(lite_name + '.act', nn.ReLU6()) + else: + if dcn_head: + head_conv.add_sublayer( + name, + DeformableConvV2( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + kernel_size=3, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) + else: + head_conv.add_sublayer( + name, + nn.Conv2D( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=ParamAttr( + learning_rate=2., regularizer=L2Decay(0.)))) + head_conv.add_sublayer(name + '.act', nn.ReLU()) + self.feat = head_conv + bias_init = float(-np.log((1 - 0.01) / 0.01)) + self.head = nn.Conv2D( + in_channels=ch_out, + out_channels=num_classes, + kernel_size=1, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=ParamAttr( + learning_rate=2., + regularizer=L2Decay(0.), + initializer=Constant(bias_init))) + + def forward(self, feat): + out = self.feat(feat) + out = self.head(out) + return out + + +@register +class WHHead(nn.Layer): + """ + Args: + ch_in (int): The channel number of input Tensor. + ch_out (int): The channel number of output Tensor. + conv_num (int): The convolution number of wh_feat. + dcn_head(bool): whether use dcn in head. False by default. + lite_head(bool): whether use lite version. False by default. + norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. + bn by default + Return: + Width & Height head output + """ + __shared__ = ['norm_type'] + + def __init__(self, + ch_in, + ch_out=64, + conv_num=2, + dcn_head=False, + lite_head=False, + norm_type='bn'): + super(WHHead, self).__init__() + head_conv = nn.Sequential() + for i in range(conv_num): + name = 'conv.{}'.format(i) + if lite_head: + lite_name = 'wh.' + name + head_conv.add_sublayer( + lite_name, + LiteConv( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + norm_type=norm_type)) + head_conv.add_sublayer(lite_name + '.act', nn.ReLU6()) + else: + if dcn_head: + head_conv.add_sublayer( + name, + DeformableConvV2( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + kernel_size=3, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) + else: + head_conv.add_sublayer( + name, + nn.Conv2D( + in_channels=ch_in if i == 0 else ch_out, + out_channels=ch_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0, 0.01)), + bias_attr=ParamAttr( + learning_rate=2., regularizer=L2Decay(0.)))) + head_conv.add_sublayer(name + '.act', nn.ReLU()) + + self.feat = head_conv + self.head = nn.Conv2D( + in_channels=ch_out, + out_channels=4, + kernel_size=1, + weight_attr=ParamAttr(initializer=Normal(0, 0.001)), + bias_attr=ParamAttr( + learning_rate=2., regularizer=L2Decay(0.))) + + def forward(self, feat): + out = self.feat(feat) + out = self.head(out) + out = F.relu(out) + return out + + +@register +class TTFHead(nn.Layer): + """ + TTFHead + Args: + in_channels (int): the channel number of input to TTFHead. + num_classes (int): the number of classes, 80 by default. + hm_head_planes (int): the channel number in heatmap head, + 128 by default. + wh_head_planes (int): the channel number in width & height head, + 64 by default. + hm_head_conv_num (int): the number of convolution in heatmap head, + 2 by default. + wh_head_conv_num (int): the number of convolution in width & height + head, 2 by default. 
+ hm_loss (object): Instance of 'CTFocalLoss'. + wh_loss (object): Instance of 'GIoULoss'. + wh_offset_base (float): the base offset of width and height, + 16.0 by default. + down_ratio (int): the actual down_ratio is calculated by base_down_ratio + (default 16) and the number of upsample layers. + lite_head(bool): whether use lite version. False by default. + norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. + bn by default + """ + + __shared__ = ['num_classes', 'down_ratio', 'norm_type'] + __inject__ = ['hm_loss', 'wh_loss'] + + def __init__(self, + in_channels, + num_classes=80, + hm_head_planes=128, + wh_head_planes=64, + hm_head_conv_num=2, + wh_head_conv_num=2, + hm_loss='CTFocalLoss', + wh_loss='GIoULoss', + wh_offset_base=16., + down_ratio=4, + dcn_head=False, + lite_head=False, + norm_type='bn'): + super(TTFHead, self).__init__() + self.in_channels = in_channels + self.hm_head = HMHead(in_channels, hm_head_planes, num_classes, + hm_head_conv_num, dcn_head, lite_head, norm_type) + self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num, + dcn_head, lite_head, norm_type) + self.hm_loss = hm_loss + self.wh_loss = wh_loss + + self.wh_offset_base = wh_offset_base + self.down_ratio = down_ratio + + @classmethod + def from_config(cls, cfg, input_shape): + if isinstance(input_shape, (list, tuple)): + input_shape = input_shape[0] + return {'in_channels': input_shape.channels, } + + def forward(self, feats): + hm = self.hm_head(feats) + wh = self.wh_head(feats) * self.wh_offset_base + return hm, wh + + def filter_box_by_weight(self, pred, target, weight): + """ + Filter out boxes where ttf_reg_weight is 0, only keep positive samples. + """ + index = paddle.nonzero(weight > 0) + index.stop_gradient = True + weight = paddle.gather_nd(weight, index) + pred = paddle.gather_nd(pred, index) + target = paddle.gather_nd(target, index) + return pred, target, weight + + def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight): + pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4) + hm_loss = self.hm_loss(pred_hm, target_hm) + H, W = target_hm.shape[2:] + mask = paddle.reshape(target_weight, [-1, H, W]) + avg_factor = paddle.sum(mask) + 1e-4 + + base_step = self.down_ratio + shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32') + shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32') + shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x]) + base_loc = paddle.stack([shift_x, shift_y], axis=0) + base_loc.stop_gradient = True + + pred_boxes = paddle.concat( + [0 - pred_wh[:, 0:2, :, :] + base_loc, pred_wh[:, 2:4] + base_loc], + axis=1) + pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1]) + boxes = paddle.transpose(box_target, [0, 2, 3, 1]) + boxes.stop_gradient = True + + pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes, + mask) + mask.stop_gradient = True + wh_loss = self.wh_loss(pred_boxes, boxes, iou_weight=mask.unsqueeze(1)) + wh_loss = wh_loss / avg_factor + + ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss} + return ttf_loss diff --git a/ppdet/modeling/heads/yolo_head.py b/ppdet/modeling/heads/yolo_head.py new file mode 100644 index 0000000..a081774 --- /dev/null +++ b/ppdet/modeling/heads/yolo_head.py @@ -0,0 +1,123 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register + + +def _de_sigmoid(x, eps=1e-7): + x = paddle.clip(x, eps, 1. 
/ eps) + x = paddle.clip(1. / x - 1., eps, 1. / eps) + x = -paddle.log(x) + return x + + +@register +class YOLOv3Head(nn.Layer): + __shared__ = ['num_classes', 'data_format'] + __inject__ = ['loss'] + + def __init__(self, + in_channels=[1024, 512, 256], + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + num_classes=80, + loss='YOLOv3Loss', + iou_aware=False, + iou_aware_factor=0.4, + data_format='NCHW'): + """ + Head for YOLOv3 network + + Args: + num_classes (int): number of foreground classes + anchors (list): anchors + anchor_masks (list): anchor masks + loss (object): YOLOv3Loss instance + iou_aware (bool): whether to use iou_aware + iou_aware_factor (float): iou aware factor + data_format (str): data format, NCHW or NHWC + """ + super(YOLOv3Head, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_classes = num_classes + self.loss = loss + + self.iou_aware = iou_aware + self.iou_aware_factor = iou_aware_factor + + self.parse_anchor(anchors, anchor_masks) + self.num_outputs = len(self.anchors) + self.data_format = data_format + + self.yolo_outputs = [] + for i in range(len(self.anchors)): + + if self.iou_aware: + num_filters = len(self.anchors[i]) * (self.num_classes + 6) + else: + num_filters = len(self.anchors[i]) * (self.num_classes + 5) + name = 'yolo_output.{}'.format(i) + conv = nn.Conv2D( + in_channels=self.in_channels[i], + out_channels=num_filters, + kernel_size=1, + stride=1, + padding=0, + data_format=data_format, + bias_attr=ParamAttr(regularizer=L2Decay(0.))) + yolo_output = self.add_sublayer(name, conv) + self.yolo_outputs.append(yolo_output) + + def parse_anchor(self, anchors, anchor_masks): + self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] + self.mask_anchors = [] + anchor_num = len(anchors) + for masks in anchor_masks: + self.mask_anchors.append([]) + for mask in masks: + assert mask < anchor_num, "anchor mask index overflow" + self.mask_anchors[-1].extend(anchors[mask]) + + def forward(self, feats, targets=None): + assert len(feats) == len(self.anchors) + yolo_outputs = [] + for i, feat in enumerate(feats): + yolo_output = self.yolo_outputs[i](feat) + if self.data_format == 'NHWC': + yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) + yolo_outputs.append(yolo_output) + + if self.training: + return self.loss(yolo_outputs, targets, self.anchors) + else: + if self.iou_aware: + y = [] + for i, out in enumerate(yolo_outputs): + na = len(self.anchors[i]) + ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] + b, c, h, w = x.shape + no = c // na + x = x.reshape((b, na, no, h * w)) + ioup = ioup.reshape((b, na, 1, h * w)) + obj = x[:, :, 4:5, :] + ioup = F.sigmoid(ioup) + obj = F.sigmoid(obj) + obj_t = (obj**(1 - self.iou_aware_factor)) * ( + ioup**self.iou_aware_factor) + obj_t = _de_sigmoid(obj_t) + loc_t = x[:, :, :4, :] + cls_t = x[:, :, 5:, :] + y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) + y_t = y_t.reshape((b, c, h, w)) + y.append(y_t) + return y + else: + return yolo_outputs + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } diff --git a/ppdet/modeling/layers.py b/ppdet/modeling/layers.py new file mode 100644 index 0000000..5877b5f --- /dev/null +++ b/ppdet/modeling/layers.py @@ -0,0 +1,945 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import six +import numpy as np +from numbers import Integral + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle import to_tensor +from paddle.nn import Conv2D, BatchNorm2D, GroupNorm +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant, XavierUniform +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register, serializable +from ppdet.modeling.bbox_utils import delta2bbox +from . import ops + +from paddle.vision.ops import DeformConv2D + + +def _to_list(l): + if isinstance(l, (list, tuple)): + return list(l) + return [l] + + +class DeformableConvV2(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 + self.mask_channel = kernel_size**2 + + if lr_scale == 1 and regularizer is None: + offset_bias_attr = ParamAttr(initializer=Constant(0.)) + else: + offset_bias_attr = ParamAttr( + initializer=Constant(0.), + learning_rate=lr_scale, + regularizer=regularizer) + self.conv_offset = nn.Conv2D( + in_channels, + 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.0)), + bias_attr=offset_bias_attr) + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = ParamAttr( + initializer=Constant(value=0), + regularizer=L2Decay(0.), + learning_rate=2.) 
+ else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = DeformConv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=dcn_bias_attr) + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + use_dcn=False, + bias_on=False, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01)): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn'] + + if bias_on: + bias_attr = ParamAttr( + initializer=Constant(value=0.), learning_rate=lr_scale) + else: + bias_attr = False + + if not use_dcn: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=bias_attr) + else: + # in FCOS-DCN head, specifically need learning_rate and regularizer + self.conv = DeformableConvV2( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=True, + lr_scale=2., + regularizer=L2Decay(norm_decay)) + + norm_lr = 0. if freeze_norm else 1. + param_attr = ParamAttr( + learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) + bias_attr = ParamAttr( + learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) + if norm_type == 'bn': + self.norm = nn.BatchNorm2D( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'sync_bn': + self.norm = nn.SyncBatchNorm( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + return out + + +class LiteConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride=1, + with_act=True, + norm_type='sync_bn', + name=None): + super(LiteConv, self).__init__() + self.lite_conv = nn.Sequential() + conv1 = ConvNormLayer( + in_channels, + in_channels, + filter_size=5, + stride=stride, + groups=in_channels, + norm_type=norm_type, + initializer=XavierUniform()) + conv2 = ConvNormLayer( + in_channels, + out_channels, + filter_size=1, + stride=stride, + norm_type=norm_type, + initializer=XavierUniform()) + conv3 = ConvNormLayer( + out_channels, + out_channels, + filter_size=1, + stride=stride, + norm_type=norm_type, + initializer=XavierUniform()) + conv4 = ConvNormLayer( + out_channels, + out_channels, + filter_size=5, + stride=stride, + groups=out_channels, + norm_type=norm_type, + initializer=XavierUniform()) + conv_list = [conv1, conv2, conv3, conv4] + self.lite_conv.add_sublayer('conv1', conv1) + self.lite_conv.add_sublayer('relu6_1', nn.ReLU6()) + self.lite_conv.add_sublayer('conv2', conv2) + if with_act: + self.lite_conv.add_sublayer('relu6_2', nn.ReLU6()) + self.lite_conv.add_sublayer('conv3', conv3) + 
self.lite_conv.add_sublayer('relu6_3', nn.ReLU6()) + self.lite_conv.add_sublayer('conv4', conv4) + if with_act: + self.lite_conv.add_sublayer('relu6_4', nn.ReLU6()) + + def forward(self, inputs): + out = self.lite_conv(inputs) + return out + + +@register +@serializable +class AnchorGeneratorRPN(object): + def __init__(self, + anchor_sizes=[32, 64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + stride=[16.0, 16.0], + variance=[1.0, 1.0, 1.0, 1.0], + anchor_start_size=None): + super(AnchorGeneratorRPN, self).__init__() + self.anchor_sizes = anchor_sizes + self.aspect_ratios = aspect_ratios + self.stride = stride + self.variance = variance + self.anchor_start_size = anchor_start_size + + def __call__(self, input, level=None): + anchor_sizes = self.anchor_sizes if ( + level is None or self.anchor_start_size is None) else ( + self.anchor_start_size * 2**level) + stride = self.stride if ( + level is None or self.anchor_start_size is None) else ( + self.stride[0] * (2.**level), self.stride[1] * (2.**level)) + anchor, var = ops.anchor_generator( + input=input, + anchor_sizes=anchor_sizes, + aspect_ratios=self.aspect_ratios, + stride=stride, + variance=self.variance) + return anchor, var + + +@register +@serializable +class AnchorGeneratorSSD(object): + def __init__(self, + steps=[8, 16, 32, 64, 100, 300], + aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]], + min_ratio=15, + max_ratio=90, + base_size=300, + min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], + max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], + offset=0.5, + flip=True, + clip=False, + min_max_aspect_ratios_order=False): + self.steps = steps + self.aspect_ratios = aspect_ratios + self.min_ratio = min_ratio + self.max_ratio = max_ratio + self.base_size = base_size + self.min_sizes = min_sizes + self.max_sizes = max_sizes + self.offset = offset + self.flip = flip + self.clip = clip + self.min_max_aspect_ratios_order = min_max_aspect_ratios_order + + if self.min_sizes == [] and self.max_sizes == []: + num_layer = len(aspect_ratios) + step = int( + math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2 + ))) + for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1, + step): + self.min_sizes.append(self.base_size * ratio / 100.) + self.max_sizes.append(self.base_size * (ratio + step) / 100.) 
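+            # Descriptive note: this mirrors the prior-scale scheme of the SSD
+            # paper. Scales are spread evenly (as percentages of base_size)
+            # between min_ratio and max_ratio for all but the first feature
+            # map; the first map is then prepended with the smaller
+            # 10% / 20% sizes below.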
+ self.min_sizes = [self.base_size * .10] + self.min_sizes + self.max_sizes = [self.base_size * .20] + self.max_sizes + + self.num_priors = [] + for aspect_ratio, min_size, max_size in zip( + aspect_ratios, self.min_sizes, self.max_sizes): + if isinstance(min_size, (list, tuple)): + self.num_priors.append( + len(_to_list(min_size)) + len(_to_list(max_size))) + else: + self.num_priors.append((len(aspect_ratio) * 2 + 1) * len( + _to_list(min_size)) + len(_to_list(max_size))) + + def __call__(self, inputs, image): + boxes = [] + for input, min_size, max_size, aspect_ratio, step in zip( + inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, + self.steps): + box, _ = ops.prior_box( + input=input, + image=image, + min_sizes=_to_list(min_size), + max_sizes=_to_list(max_size), + aspect_ratios=aspect_ratio, + flip=self.flip, + clip=self.clip, + steps=[step, step], + offset=self.offset, + min_max_aspect_ratios_order=self.min_max_aspect_ratios_order) + boxes.append(paddle.reshape(box, [-1, 4])) + return boxes + + +@register +@serializable +class RCNNBox(object): + __shared__ = ['num_classes'] + + def __init__(self, + prior_box_var=[10., 10., 5., 5.], + code_type="decode_center_size", + box_normalized=False, + num_classes=80): + super(RCNNBox, self).__init__() + self.prior_box_var = prior_box_var + self.code_type = code_type + self.box_normalized = box_normalized + self.num_classes = num_classes + + def __call__(self, bbox_head_out, rois, im_shape, scale_factor): + bbox_pred, cls_prob = bbox_head_out + roi, rois_num = rois + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + scale_list = [] + origin_shape_list = [] + for idx, roi_per_im in enumerate(roi): + rois_num_per_im = rois_num[idx] + expand_im_shape = paddle.expand(im_shape[idx, :], + [rois_num_per_im, 2]) + origin_shape_list.append(expand_im_shape) + + origin_shape = paddle.concat(origin_shape_list) + + # bbox_pred.shape: [N, C*4] + # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head) + bbox = paddle.concat(roi) + if bbox.shape[0] == 0: + bbox = paddle.zeros([0, bbox_pred.shape[1]], dtype='float32') + else: + bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) + scores = cls_prob[:, :-1] + + # bbox.shape: [N, C, 4] + # bbox.shape[1] must be equal to scores.shape[1] + bbox_num_class = bbox.shape[1] + if bbox_num_class == 1: + bbox = paddle.tile(bbox, [1, self.num_classes, 1]) + + origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) + origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) + zeros = paddle.zeros_like(origin_h) + x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) + y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) + x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) + y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) + bbox = paddle.stack([x1, y1, x2, y2], axis=-1) + bboxes = (bbox, rois_num) + return bboxes, scores + + +@register +@serializable +class MultiClassNMS(object): + def __init__(self, + score_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + nms_threshold=.5, + normalized=True, + nms_eta=1.0, + return_rois_num=True): + super(MultiClassNMS, self).__init__() + self.score_threshold = score_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.nms_threshold = nms_threshold + self.normalized = normalized + self.nms_eta = nms_eta + self.return_rois_num = return_rois_num + + def __call__(self, bboxes, score, background_label=-1): + """ + bboxes (Tensor|List[Tensor]): 1. 
(Tensor) Predicted bboxes with shape + [N, M, 4], N is the batch size and M + is the number of bboxes + 2. (List[Tensor]) bboxes and bbox_num, + bboxes have shape of [M, C, 4], C + is the class number and bbox_num means + the number of bboxes of each batch with + shape [N,] + score (Tensor): Predicted scores with shape [N, C, M] or [M, C] + background_label (int): Ignore the background label; For example, RCNN + is num_classes and YOLO is -1. + """ + kwargs = self.__dict__.copy() + if isinstance(bboxes, tuple): + bboxes, bbox_num = bboxes + kwargs.update({'rois_num': bbox_num}) + if background_label > -1: + kwargs.update({'background_label': background_label}) + return ops.multiclass_nms(bboxes, score, **kwargs) + + +@register +@serializable +class MatrixNMS(object): + __append_doc__ = True + + def __init__(self, + score_threshold=.05, + post_threshold=.05, + nms_top_k=-1, + keep_top_k=100, + use_gaussian=False, + gaussian_sigma=2., + normalized=False, + background_label=0): + super(MatrixNMS, self).__init__() + self.score_threshold = score_threshold + self.post_threshold = post_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + self.normalized = normalized + self.use_gaussian = use_gaussian + self.gaussian_sigma = gaussian_sigma + self.background_label = background_label + + def __call__(self, bbox, score, *args): + return ops.matrix_nms( + bboxes=bbox, + scores=score, + score_threshold=self.score_threshold, + post_threshold=self.post_threshold, + nms_top_k=self.nms_top_k, + keep_top_k=self.keep_top_k, + use_gaussian=self.use_gaussian, + gaussian_sigma=self.gaussian_sigma, + background_label=self.background_label, + normalized=self.normalized) + + +@register +@serializable +class YOLOBox(object): + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + conf_thresh=0.005, + downsample_ratio=32, + clip_bbox=True, + scale_x_y=1.): + self.num_classes = num_classes + self.conf_thresh = conf_thresh + self.downsample_ratio = downsample_ratio + self.clip_bbox = clip_bbox + self.scale_x_y = scale_x_y + + def __call__(self, + yolo_head_out, + anchors, + im_shape, + scale_factor, + var_weight=None): + boxes_list = [] + scores_list = [] + origin_shape = im_shape / scale_factor + origin_shape = paddle.cast(origin_shape, 'int32') + for i, head_out in enumerate(yolo_head_out): + boxes, scores = ops.yolo_box(head_out, origin_shape, anchors[i], + self.num_classes, self.conf_thresh, + self.downsample_ratio // 2**i, + self.clip_bbox, self.scale_x_y) + boxes_list.append(boxes) + scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) + yolo_boxes = paddle.concat(boxes_list, axis=1) + yolo_scores = paddle.concat(scores_list, axis=2) + return yolo_boxes, yolo_scores + + +@register +@serializable +class SSDBox(object): + def __init__(self, is_normalized=True): + self.is_normalized = is_normalized + self.norm_delta = float(not self.is_normalized) + + def __call__(self, + preds, + prior_boxes, + im_shape, + scale_factor, + var_weight=None): + boxes, scores = preds + outputs = [] + for box, score, prior_box in zip(boxes, scores, prior_boxes): + pb_w = prior_box[:, 2] - prior_box[:, 0] + self.norm_delta + pb_h = prior_box[:, 3] - prior_box[:, 1] + self.norm_delta + pb_x = prior_box[:, 0] + pb_w * 0.5 + pb_y = prior_box[:, 1] + pb_h * 0.5 + out_x = pb_x + box[:, :, 0] * pb_w * 0.1 + out_y = pb_y + box[:, :, 1] * pb_h * 0.1 + out_w = paddle.exp(box[:, :, 2] * 0.2) * pb_w + out_h = paddle.exp(box[:, :, 3] * 0.2) * pb_h + + if self.is_normalized: + h = paddle.unsqueeze( + 
im_shape[:, 0] / scale_factor[:, 0], axis=-1) + w = paddle.unsqueeze( + im_shape[:, 1] / scale_factor[:, 1], axis=-1) + output = paddle.stack( + [(out_x - out_w / 2.) * w, (out_y - out_h / 2.) * h, + (out_x + out_w / 2.) * w, (out_y + out_h / 2.) * h], + axis=-1) + else: + output = paddle.stack( + [ + out_x - out_w / 2., out_y - out_h / 2., + out_x + out_w / 2. - 1., out_y + out_h / 2. - 1. + ], + axis=-1) + outputs.append(output) + boxes = paddle.concat(outputs, axis=1) + + scores = F.softmax(paddle.concat(scores, axis=1)) + scores = paddle.transpose(scores, [0, 2, 1]) + + return boxes, scores + + +@register +@serializable +class AnchorGrid(object): + """Generate anchor grid + + Args: + image_size (int or list): input image size, may be a single integer or + list of [h, w]. Default: 512 + min_level (int): min level of the feature pyramid. Default: 3 + max_level (int): max level of the feature pyramid. Default: 7 + anchor_base_scale: base anchor scale. Default: 4 + num_scales: number of anchor scales. Default: 3 + aspect_ratios: aspect ratios. default: [[1, 1], [1.4, 0.7], [0.7, 1.4]] + """ + + def __init__(self, + image_size=512, + min_level=3, + max_level=7, + anchor_base_scale=4, + num_scales=3, + aspect_ratios=[[1, 1], [1.4, 0.7], [0.7, 1.4]]): + super(AnchorGrid, self).__init__() + if isinstance(image_size, Integral): + self.image_size = [image_size, image_size] + else: + self.image_size = image_size + for dim in self.image_size: + assert dim % 2 ** max_level == 0, \ + "image size should be multiple of the max level stride" + self.min_level = min_level + self.max_level = max_level + self.anchor_base_scale = anchor_base_scale + self.num_scales = num_scales + self.aspect_ratios = aspect_ratios + + @property + def base_cell(self): + if not hasattr(self, '_base_cell'): + self._base_cell = self.make_cell() + return self._base_cell + + def make_cell(self): + scales = [2**(i / self.num_scales) for i in range(self.num_scales)] + scales = np.array(scales) + ratios = np.array(self.aspect_ratios) + ws = np.outer(scales, ratios[:, 0]).reshape(-1, 1) + hs = np.outer(scales, ratios[:, 1]).reshape(-1, 1) + anchors = np.hstack((-0.5 * ws, -0.5 * hs, 0.5 * ws, 0.5 * hs)) + return anchors + + def make_grid(self, stride): + cell = self.base_cell * stride * self.anchor_base_scale + x_steps = np.arange(stride // 2, self.image_size[1], stride) + y_steps = np.arange(stride // 2, self.image_size[0], stride) + offset_x, offset_y = np.meshgrid(x_steps, y_steps) + offset_x = offset_x.flatten() + offset_y = offset_y.flatten() + offsets = np.stack((offset_x, offset_y, offset_x, offset_y), axis=-1) + offsets = offsets[:, np.newaxis, :] + return (cell + offsets).reshape(-1, 4) + + def generate(self): + return [ + self.make_grid(2**l) + for l in range(self.min_level, self.max_level + 1) + ] + + def __call__(self): + if not hasattr(self, '_anchor_vars'): + anchor_vars = [] + helper = LayerHelper('anchor_grid') + for idx, l in enumerate(range(self.min_level, self.max_level + 1)): + stride = 2**l + anchors = self.make_grid(stride) + var = helper.create_parameter( + attr=ParamAttr(name='anchors_{}'.format(idx)), + shape=anchors.shape, + dtype='float32', + stop_gradient=True, + default_initializer=NumpyArrayInitializer(anchors)) + anchor_vars.append(var) + var.persistable = True + self._anchor_vars = anchor_vars + + return self._anchor_vars + + +@register +@serializable +class FCOSBox(object): + __shared__ = ['num_classes'] + + def __init__(self, num_classes=80): + super(FCOSBox, self).__init__() + self.num_classes = 
num_classes + + def _merge_hw(self, inputs, ch_type="channel_first"): + """ + Merge h and w of the feature map into one dimension. + Args: + inputs (Tensor): Tensor of the input feature map + ch_type (str): "channel_first" or "channel_last" style + Return: + new_shape (Tensor): The new shape after h and w merged + """ + shape_ = paddle.shape(inputs) + bs, ch, hi, wi = shape_[0], shape_[1], shape_[2], shape_[3] + img_size = hi * wi + img_size.stop_gradient = True + if ch_type == "channel_first": + new_shape = paddle.concat([bs, ch, img_size]) + elif ch_type == "channel_last": + new_shape = paddle.concat([bs, img_size, ch]) + else: + raise KeyError("Wrong ch_type %s" % ch_type) + new_shape.stop_gradient = True + return new_shape + + def _postprocessing_by_level(self, locations, box_cls, box_reg, box_ctn, + scale_factor): + """ + Postprocess each layer of the output with corresponding locations. + Args: + locations (Tensor): anchor points for current layer, [H*W, 2] + box_cls (Tensor): categories prediction, [N, C, H, W], + C is the number of classes + box_reg (Tensor): bounding box prediction, [N, 4, H, W] + box_ctn (Tensor): centerness prediction, [N, 1, H, W] + scale_factor (Tensor): [h_scale, w_scale] for input images + Return: + box_cls_ch_last (Tensor): score for each category, in [N, C, M] + C is the number of classes and M is the number of anchor points + box_reg_decoding (Tensor): decoded bounding box, in [N, M, 4] + last dimension is [x1, y1, x2, y2] + """ + act_shape_cls = self._merge_hw(box_cls) + box_cls_ch_last = paddle.reshape(x=box_cls, shape=act_shape_cls) + box_cls_ch_last = F.sigmoid(box_cls_ch_last) + + act_shape_reg = self._merge_hw(box_reg) + box_reg_ch_last = paddle.reshape(x=box_reg, shape=act_shape_reg) + box_reg_ch_last = paddle.transpose(box_reg_ch_last, perm=[0, 2, 1]) + box_reg_decoding = paddle.stack( + [ + locations[:, 0] - box_reg_ch_last[:, :, 0], + locations[:, 1] - box_reg_ch_last[:, :, 1], + locations[:, 0] + box_reg_ch_last[:, :, 2], + locations[:, 1] + box_reg_ch_last[:, :, 3] + ], + axis=1) + box_reg_decoding = paddle.transpose(box_reg_decoding, perm=[0, 2, 1]) + + act_shape_ctn = self._merge_hw(box_ctn) + box_ctn_ch_last = paddle.reshape(x=box_ctn, shape=act_shape_ctn) + box_ctn_ch_last = F.sigmoid(box_ctn_ch_last) + + # recover the location to original image + im_scale = paddle.concat([scale_factor, scale_factor], axis=1) + box_reg_decoding = box_reg_decoding / im_scale + box_cls_ch_last = box_cls_ch_last * box_ctn_ch_last + return box_cls_ch_last, box_reg_decoding + + def __call__(self, locations, cls_logits, bboxes_reg, centerness, + scale_factor): + pred_boxes_ = [] + pred_scores_ = [] + for pts, cls, box, ctn in zip(locations, cls_logits, bboxes_reg, + centerness): + pred_scores_lvl, pred_boxes_lvl = self._postprocessing_by_level( + pts, cls, box, ctn, scale_factor) + pred_boxes_.append(pred_boxes_lvl) + pred_scores_.append(pred_scores_lvl) + pred_boxes = paddle.concat(pred_boxes_, axis=1) + pred_scores = paddle.concat(pred_scores_, axis=2) + return pred_boxes, pred_scores + + +@register +class TTFBox(object): + __shared__ = ['down_ratio'] + + def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): + super(TTFBox, self).__init__() + self.max_per_img = max_per_img + self.score_thresh = score_thresh + self.down_ratio = down_ratio + + def _simple_nms(self, heat, kernel=3): + """ + Use maxpool to filter the max score, get local peaks. 
+ """ + pad = (kernel - 1) // 2 + hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) + keep = paddle.cast(hmax == heat, 'float32') + return heat * keep + + def _topk(self, scores): + """ + Select top k scores and decode to get xy coordinates. + """ + k = self.max_per_img + shape_fm = paddle.shape(scores) + shape_fm.stop_gradient = True + cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] + # batch size is 1 + scores_r = paddle.reshape(scores, [cat, -1]) + topk_scores, topk_inds = paddle.topk(scores_r, k) + topk_scores, topk_inds = paddle.topk(scores_r, k) + topk_ys = topk_inds // width + topk_xs = topk_inds % width + + topk_score_r = paddle.reshape(topk_scores, [-1]) + topk_score, topk_ind = paddle.topk(topk_score_r, k) + k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64') + topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') + + topk_inds = paddle.reshape(topk_inds, [-1]) + topk_ys = paddle.reshape(topk_ys, [-1, 1]) + topk_xs = paddle.reshape(topk_xs, [-1, 1]) + topk_inds = paddle.gather(topk_inds, topk_ind) + topk_ys = paddle.gather(topk_ys, topk_ind) + topk_xs = paddle.gather(topk_xs, topk_ind) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + + def __call__(self, hm, wh, im_shape, scale_factor): + heatmap = F.sigmoid(hm) + heat = self._simple_nms(heatmap) + scores, inds, clses, ys, xs = self._topk(heat) + ys = paddle.cast(ys, 'float32') * self.down_ratio + xs = paddle.cast(xs, 'float32') * self.down_ratio + scores = paddle.tensor.unsqueeze(scores, [1]) + clses = paddle.tensor.unsqueeze(clses, [1]) + + wh_t = paddle.transpose(wh, [0, 2, 3, 1]) + wh = paddle.reshape(wh_t, [-1, paddle.shape(wh_t)[-1]]) + wh = paddle.gather(wh, inds) + + x1 = xs - wh[:, 0:1] + y1 = ys - wh[:, 1:2] + x2 = xs + wh[:, 2:3] + y2 = ys + wh[:, 3:4] + + bboxes = paddle.concat([x1, y1, x2, y2], axis=1) + + scale_y = scale_factor[:, 0:1] + scale_x = scale_factor[:, 1:2] + scale_expand = paddle.concat( + [scale_x, scale_y, scale_x, scale_y], axis=1) + boxes_shape = paddle.shape(bboxes) + boxes_shape.stop_gradient = True + scale_expand = paddle.expand(scale_expand, shape=boxes_shape) + bboxes = paddle.divide(bboxes, scale_expand) + results = paddle.concat([clses, scores, bboxes], axis=1) + # hack: append result with cls=-1 and score=1. to avoid all scores + # are less than score_thresh which may cause error in gather. + fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) + fill_r = paddle.cast(fill_r, results.dtype) + results = paddle.concat([results, fill_r]) + scores = results[:, 1] + valid_ind = paddle.nonzero(scores > self.score_thresh) + results = paddle.gather(results, valid_ind) + return results, paddle.shape(results)[0:1] + + +@register +@serializable +class MaskMatrixNMS(object): + """ + Matrix NMS for multi-class masks. + Args: + update_threshold (float): Updated threshold of categroy score in second time. + pre_nms_top_n (int): Number of total instance to be kept per image before NMS + post_nms_top_n (int): Number of total instance to be kept per image after NMS. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. 
+ Input: + seg_preds (Variable): shape (n, h, w), segmentation feature maps + seg_masks (Variable): shape (n, h, w), segmentation feature maps + cate_labels (Variable): shape (n), mask labels in descending order + cate_scores (Variable): shape (n), mask scores in descending order + sum_masks (Variable): a float tensor of the sum of seg_masks + Returns: + Variable: cate_scores, tensors of shape (n) + """ + + def __init__(self, + update_threshold=0.05, + pre_nms_top_n=500, + post_nms_top_n=100, + kernel='gaussian', + sigma=2.0): + super(MaskMatrixNMS, self).__init__() + self.update_threshold = update_threshold + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.kernel = kernel + self.sigma = sigma + + def _sort_score(self, scores, top_num): + if paddle.shape(scores)[0] > top_num: + return paddle.topk(scores, top_num)[1] + else: + return paddle.argsort(scores, descending=True) + + def __call__(self, + seg_preds, + seg_masks, + cate_labels, + cate_scores, + sum_masks=None): + # sort and keep top nms_pre + sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n) + seg_masks = paddle.gather(seg_masks, index=sort_inds) + seg_preds = paddle.gather(seg_preds, index=sort_inds) + sum_masks = paddle.gather(sum_masks, index=sort_inds) + cate_scores = paddle.gather(cate_scores, index=sort_inds) + cate_labels = paddle.gather(cate_labels, index=sort_inds) + + seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1) + # inter. + inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0])) + n_samples = paddle.shape(cate_labels) + # union. + sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples]) + # iou. + iou_matrix = (inter_matrix / ( + sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix)) + iou_matrix = paddle.triu(iou_matrix, diagonal=1) + # label_specific matrix. + cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples]) + label_matrix = paddle.cast( + (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), + 'float32') + label_matrix = paddle.triu(label_matrix, diagonal=1) + + # IoU compensation + compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0) + compensate_iou = paddle.expand( + compensate_iou, shape=[n_samples, n_samples]) + compensate_iou = paddle.transpose(compensate_iou, [1, 0]) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # matrix nms + if self.kernel == 'gaussian': + decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2)) + compensate_matrix = paddle.exp(-1 * self.sigma * + (compensate_iou**2)) + decay_coefficient = paddle.min(decay_matrix / compensate_matrix, + axis=0) + elif self.kernel == 'linear': + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient = paddle.min(decay_matrix, axis=0) + else: + raise NotImplementedError + + # update the score. 
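+        # Descriptive note: Matrix NMS (from SOLOv2) does not discard
+        # overlapping masks; each score is multiplied by a decay factor. For
+        # every instance, the decay is the minimum over higher-scoring
+        # instances of the same class of f(decay_iou) / f(compensate_iou),
+        # where f is the gaussian kernel exp(-sigma * iou^2) or the linear
+        # kernel (1 - iou), so heavily overlapped, lower-scoring masks are
+        # suppressed softly.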
+ cate_scores = cate_scores * decay_coefficient + y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32') + keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, + y) + keep = paddle.nonzero(keep) + keep = paddle.squeeze(keep, axis=[1]) + # Prevent empty and increase fake data + keep = paddle.concat( + [keep, paddle.cast(paddle.shape(cate_scores)[0] - 1, 'int64')]) + + seg_preds = paddle.gather(seg_preds, index=keep) + cate_scores = paddle.gather(cate_scores, index=keep) + cate_labels = paddle.gather(cate_labels, index=keep) + + # sort and keep top_k + sort_inds = self._sort_score(cate_scores, self.post_nms_top_n) + seg_preds = paddle.gather(seg_preds, index=sort_inds) + cate_scores = paddle.gather(cate_scores, index=sort_inds) + cate_labels = paddle.gather(cate_labels, index=sort_inds) + return seg_preds, cate_scores, cate_labels diff --git a/ppdet/modeling/losses/__init__.py b/ppdet/modeling/losses/__init__.py new file mode 100644 index 0000000..7a38168 --- /dev/null +++ b/ppdet/modeling/losses/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import yolo_loss +from . import iou_aware_loss +from . import iou_loss +from . import ssd_loss +from . import fcos_loss +from . import solov2_loss +from . 
import ctfocal_loss + +from .yolo_loss import * +from .iou_aware_loss import * +from .iou_loss import * +from .ssd_loss import * +from .fcos_loss import * +from .solov2_loss import * +from .ctfocal_loss import * diff --git a/ppdet/modeling/losses/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..816760b Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..a4ec660 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-38.pyc new file mode 100644 index 0000000..b59950c Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-39.pyc new file mode 100644 index 0000000..d11e667 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/ctfocal_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-38.pyc new file mode 100644 index 0000000..51470fd Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-39.pyc new file mode 100644 index 0000000..9da9309 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/fcos_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-38.pyc new file mode 100644 index 0000000..18c7196 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-39.pyc new file mode 100644 index 0000000..657f427 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/iou_aware_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/iou_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/iou_loss.cpython-38.pyc new file mode 100644 index 0000000..9945058 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/iou_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/iou_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/iou_loss.cpython-39.pyc new file mode 100644 index 0000000..01c909b Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/iou_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-38.pyc new file mode 100644 index 0000000..f2ba1a1 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-39.pyc new file mode 100644 index 0000000..81911cf Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/solov2_loss.cpython-39.pyc differ diff --git 
a/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-38.pyc new file mode 100644 index 0000000..9726dc1 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-39.pyc new file mode 100644 index 0000000..bfbb5f7 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/ssd_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-38.pyc b/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-38.pyc new file mode 100644 index 0000000..5768f38 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-38.pyc differ diff --git a/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-39.pyc b/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-39.pyc new file mode 100644 index 0000000..3a29c85 Binary files /dev/null and b/ppdet/modeling/losses/__pycache__/yolo_loss.cpython-39.pyc differ diff --git a/ppdet/modeling/losses/ctfocal_loss.py b/ppdet/modeling/losses/ctfocal_loss.py new file mode 100644 index 0000000..455fed4 --- /dev/null +++ b/ppdet/modeling/losses/ctfocal_loss.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable + +__all__ = ['CTFocalLoss'] + + +@register +@serializable +class CTFocalLoss(object): + """ + CTFocalLoss + Args: + loss_weight (float): loss weight + gamma (float): gamma parameter for Focal Loss + """ + + def __init__(self, loss_weight=1., gamma=2.0): + self.loss_weight = loss_weight + self.gamma = gamma + + def __call__(self, pred, target): + """ + Calculate the loss + Args: + pred(Tensor): heatmap prediction + target(Tensor): target for positive samples + Return: + ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet. + Note that the values in target are in [0, 1] since gaussian is + used to reduce the punishment and we treat [0, 1) as neg example. 
+ """ + fg_map = paddle.cast(target == 1, 'float32') + fg_map.stop_gradient = True + bg_map = paddle.cast(target < 1, 'float32') + bg_map.stop_gradient = True + + neg_weights = paddle.pow(1 - target, 4) * bg_map + pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred, + self.gamma) * fg_map + neg_loss = 0 - paddle.log(1 - pred) * paddle.pow( + pred, self.gamma) * neg_weights + pos_loss = paddle.sum(pos_loss) + neg_loss = paddle.sum(neg_loss) + + fg_num = paddle.sum(fg_map) + ct_focal_loss = (pos_loss + neg_loss) / ( + fg_num + paddle.cast(fg_num == 0, 'float32')) + return ct_focal_loss * self.loss_weight diff --git a/ppdet/modeling/losses/fcos_loss.py b/ppdet/modeling/losses/fcos_loss.py new file mode 100644 index 0000000..201786c --- /dev/null +++ b/ppdet/modeling/losses/fcos_loss.py @@ -0,0 +1,226 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ppdet.modeling import ops + +INF = 1e8 +__all__ = ['FCOSLoss'] + + +def flatten_tensor(inputs, channel_first=False): + """ + Flatten a Tensor + Args: + inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] + channel_first(bool): if true the dimension order of + Tensor is [N, C, H, W], otherwise is [N, H, W, C] + Return: + input_channel_last (Tensor): The flattened Tensor in channel_last style + """ + if channel_first: + input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1]) + else: + input_channel_last = inputs + output_channel_last = paddle.flatten( + input_channel_last, start_axis=0, stop_axis=2) # [N*H*W, C] + return output_channel_last + + +@register +class FCOSLoss(nn.Layer): + """ + FCOSLoss + Args: + loss_alpha (float): alpha in focal loss + loss_gamma (float): gamma in focal loss + iou_loss_type(str): location loss type, IoU/GIoU/LINEAR_IoU + reg_weights(float): weight for location loss + """ + + def __init__(self, + loss_alpha=0.25, + loss_gamma=2.0, + iou_loss_type="giou", + reg_weights=1.0): + super(FCOSLoss, self).__init__() + self.loss_alpha = loss_alpha + self.loss_gamma = loss_gamma + self.iou_loss_type = iou_loss_type + self.reg_weights = reg_weights + + def __iou_loss(self, pred, targets, positive_mask, weights=None): + """ + Calculate the loss for location prediction + Args: + pred (Tensor): bounding boxes prediction + targets (Tensor): targets for positive samples + positive_mask (Tensor): mask of positive samples + weights (Tensor): weights for each positive samples + Return: + loss (Tensor): location loss + """ + plw = pred[:, 0] * positive_mask + pth = pred[:, 1] * positive_mask + prw = pred[:, 2] * positive_mask + pbh = pred[:, 3] * positive_mask + + tlw = targets[:, 0] * positive_mask + tth = targets[:, 1] * positive_mask + trw = targets[:, 2] * positive_mask + tbh = targets[:, 3] * 
positive_mask + tlw.stop_gradient = True + trw.stop_gradient = True + tth.stop_gradient = True + tbh.stop_gradient = True + + ilw = paddle.minimum(plw, tlw) + irw = paddle.minimum(prw, trw) + ith = paddle.minimum(pth, tth) + ibh = paddle.minimum(pbh, tbh) + + clw = paddle.maximum(plw, tlw) + crw = paddle.maximum(prw, trw) + cth = paddle.maximum(pth, tth) + cbh = paddle.maximum(pbh, tbh) + + area_predict = (plw + prw) * (pth + pbh) + area_target = (tlw + trw) * (tth + tbh) + area_inter = (ilw + irw) * (ith + ibh) + ious = (area_inter + 1.0) / ( + area_predict + area_target - area_inter + 1.0) + ious = ious * positive_mask + + if self.iou_loss_type.lower() == "linear_iou": + loss = 1.0 - ious + elif self.iou_loss_type.lower() == "giou": + area_uniou = area_predict + area_target - area_inter + area_circum = (clw + crw) * (cth + cbh) + 1e-7 + giou = ious - (area_circum - area_uniou) / area_circum + loss = 1.0 - giou + elif self.iou_loss_type.lower() == "iou": + loss = 0.0 - paddle.log(ious) + else: + raise KeyError + if weights is not None: + loss = loss * weights + return loss + + def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, + tag_bboxes, tag_center): + """ + Calculate the loss for classification, location and centerness + Args: + cls_logits (list): list of Tensor, which is predicted + score for all anchor points with shape [N, M, C] + bboxes_reg (list): list of Tensor, which is predicted + offsets for all anchor points with shape [N, M, 4] + centerness (list): list of Tensor, which is predicted + centerness for all anchor points with shape [N, M, 1] + tag_labels (list): list of Tensor, which is category + targets for each anchor point + tag_bboxes (list): list of Tensor, which is bounding + boxes targets for positive samples + tag_center (list): list of Tensor, which is centerness + targets for positive samples + Return: + loss (dict): loss composed by classification loss, bounding box + """ + cls_logits_flatten_list = [] + bboxes_reg_flatten_list = [] + centerness_flatten_list = [] + tag_labels_flatten_list = [] + tag_bboxes_flatten_list = [] + tag_center_flatten_list = [] + num_lvl = len(cls_logits) + for lvl in range(num_lvl): + cls_logits_flatten_list.append( + flatten_tensor(cls_logits[lvl], True)) + bboxes_reg_flatten_list.append( + flatten_tensor(bboxes_reg[lvl], True)) + centerness_flatten_list.append( + flatten_tensor(centerness[lvl], True)) + + tag_labels_flatten_list.append( + flatten_tensor(tag_labels[lvl], False)) + tag_bboxes_flatten_list.append( + flatten_tensor(tag_bboxes[lvl], False)) + tag_center_flatten_list.append( + flatten_tensor(tag_center[lvl], False)) + + cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) + bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) + centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) + + tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) + tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) + tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) + tag_labels_flatten.stop_gradient = True + tag_bboxes_flatten.stop_gradient = True + tag_center_flatten.stop_gradient = True + + mask_positive_bool = tag_labels_flatten > 0 + mask_positive_bool.stop_gradient = True + mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") + mask_positive_float.stop_gradient = True + + num_positive_fp32 = paddle.sum(mask_positive_float) + num_positive_fp32.stop_gradient = True + num_positive_int32 = paddle.cast(num_positive_fp32, 
dtype="int32") + num_positive_int32 = num_positive_int32 * 0 + 1 + num_positive_int32.stop_gradient = True + + normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) + normalize_sum.stop_gradient = True + + # 1. cls_logits: sigmoid_focal_loss + # expand onehot labels + num_classes = cls_logits_flatten.shape[-1] + tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1) + tag_labels_flatten_bin = F.one_hot( + tag_labels_flatten, num_classes=1 + num_classes) + tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:] + # sigmoid_focal_loss + cls_loss = F.sigmoid_focal_loss( + cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32 + + # 2. bboxes_reg: giou_loss + mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) + tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) + reg_loss = self.__iou_loss( + bboxes_reg_flatten, + tag_bboxes_flatten, + mask_positive_float, + weights=tag_center_flatten) + reg_loss = reg_loss * mask_positive_float / normalize_sum + + # 3. centerness: sigmoid_cross_entropy_with_logits_loss + centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1) + ctn_loss = ops.sigmoid_cross_entropy_with_logits(centerness_flatten, + tag_center_flatten) + ctn_loss = ctn_loss * mask_positive_float / num_positive_fp32 + + loss_all = { + "loss_centerness": paddle.sum(ctn_loss), + "loss_cls": paddle.sum(cls_loss), + "loss_box": paddle.sum(reg_loss) + } + return loss_all diff --git a/ppdet/modeling/losses/iou_aware_loss.py b/ppdet/modeling/losses/iou_aware_loss.py new file mode 100644 index 0000000..1e6aa8b --- /dev/null +++ b/ppdet/modeling/losses/iou_aware_loss.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from .iou_loss import IouLoss +from ..bbox_utils import xywh2xyxy, bbox_iou + + +@register +@serializable +class IouAwareLoss(IouLoss): + """ + iou aware loss, see https://arxiv.org/abs/1912.05992 + Args: + loss_weight (float): iou aware loss weight, default is 1.0 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + """ + + def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): + super(IouAwareLoss, self).__init__( + loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) + + def __call__(self, ioup, pbox, gbox): + iou = bbox_iou( + pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) + iou.stop_gradient = True + loss_iou_aware = F.binary_cross_entropy_with_logits( + ioup, iou, reduction='none') + loss_iou_aware = loss_iou_aware * self.loss_weight + return loss_iou_aware diff --git a/ppdet/modeling/losses/iou_loss.py b/ppdet/modeling/losses/iou_loss.py new file mode 100644 index 0000000..3ac857b --- /dev/null +++ b/ppdet/modeling/losses/iou_loss.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ..bbox_utils import xywh2xyxy, bbox_iou + +__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss'] + + +@register +@serializable +class IouLoss(object): + """ + iou loss, see https://arxiv.org/abs/1908.03851 + loss = 1.0 - iou * iou + Args: + loss_weight (float): iou loss weight, default is 2.5 + max_height (int): max height of input to support random shape input + max_width (int): max width of input to support random shape input + ciou_term (bool): whether to add ciou_term + loss_square (bool): whether to square the iou term + """ + + def __init__(self, + loss_weight=2.5, + giou=False, + diou=False, + ciou=False, + loss_square=True): + self.loss_weight = loss_weight + self.giou = giou + self.diou = diou + self.ciou = ciou + self.loss_square = loss_square + + def __call__(self, pbox, gbox): + iou = bbox_iou( + pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) + if self.loss_square: + loss_iou = 1 - iou * iou + else: + loss_iou = 1 - iou + + loss_iou = loss_iou * self.loss_weight + return loss_iou + + +@register +@serializable +class GIoULoss(object): + """ + Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + reduction (string): Options are "none", "mean" and "sum". 
default as none + """ + + def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): + self.loss_weight = loss_weight + self.eps = eps + assert reduction in ('none', 'mean', 'sum') + self.reduction = reduction + + def bbox_overlap(self, box1, box2, eps=1e-10): + """calculate the iou of box1 and box2 + Args: + box1 (Tensor): box1 with the shape (..., 4) + box2 (Tensor): box1 with the shape (..., 4) + eps (float): epsilon to avoid divide by zero + Return: + iou (Tensor): iou of box1 and box2 + overlap (Tensor): overlap of box1 and box2 + union (Tensor): union of box1 and box2 + """ + x1, y1, x2, y2 = box1 + x1g, y1g, x2g, y2g = box2 + + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + w_inter = (xkis2 - xkis1).clip(0) + h_inter = (ykis2 - ykis1).clip(0) + overlap = w_inter * h_inter + + area1 = (x2 - x1) * (y2 - y1) + area2 = (x2g - x1g) * (y2g - y1g) + union = area1 + area2 - overlap + eps + iou = overlap / union + + return iou, overlap, union + + def __call__(self, pbox, gbox, iou_weight=1.): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps + miou = iou - ((area_c - union) / area_c) + giou = 1 - miou + if self.reduction == 'none': + loss = giou + elif self.reduction == 'sum': + loss = paddle.sum(giou * iou_weight) + else: + loss = paddle.mean(giou * iou_weight) + return loss * self.loss_weight + + +@register +@serializable +class DIouLoss(GIoULoss): + """ + Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 + Args: + loss_weight (float): giou loss weight, default as 1 + eps (float): epsilon to avoid divide by zero, default as 1e-10 + use_complete_iou_loss (bool): whether to use complete iou loss + """ + + def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): + super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) + self.use_complete_iou_loss = use_complete_iou_loss + + def __call__(self, pbox, gbox, iou_weight=1.): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + w = x2 - x1 + h = y2 - y1 + + cxg = (x1g + x2g) / 2 + cyg = (y1g + y2g) / 2 + wg = x2g - x1g + hg = y2g - y1g + + x2 = paddle.maximum(x1, x2) + y2 = paddle.maximum(y1, y2) + + # A and B + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + + # A or B + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) + intsctk = intsctk * paddle.greater_than( + xkis2, xkis1) * paddle.greater_than(ykis2, ykis1) + unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g + ) - intsctk + self.eps + iouk = intsctk / unionk + + # DIOU term + dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) + dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) + diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) + + # CIOU term + ciou_term = 0 + if self.use_complete_iou_loss: + 
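+ # Aspect-ratio consistency term of CIoU (arXiv:1911.08287):
+ #   v = 4 / pi^2 * (atan(wg / hg) - atan(w / h))^2,  alpha = v / (1 - IoU + v)
+ # alpha is detached (stop_gradient) so only v carries gradient into the loss.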
ar_gt = wg / hg + ar_pred = w / h + arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) + ar_loss = 4. / np.pi / np.pi * arctan * arctan + alpha = ar_loss / (1 - iouk + ar_loss + self.eps) + alpha.stop_gradient = True + ciou_term = alpha * ar_loss + + diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) + + return diou * self.loss_weight diff --git a/ppdet/modeling/losses/solov2_loss.py b/ppdet/modeling/losses/solov2_loss.py new file mode 100644 index 0000000..ef97a77 --- /dev/null +++ b/ppdet/modeling/losses/solov2_loss.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable + +__all__ = ['SOLOv2Loss'] + + +@register +@serializable +class SOLOv2Loss(object): + """ + SOLOv2Loss + Args: + ins_loss_weight (float): Weight of instance loss. + focal_loss_gamma (float): Gamma parameter for focal loss. + focal_loss_alpha (float): Alpha parameter for focal loss. + """ + + def __init__(self, + ins_loss_weight=3.0, + focal_loss_gamma=2.0, + focal_loss_alpha=0.25): + self.ins_loss_weight = ins_loss_weight + self.focal_loss_gamma = focal_loss_gamma + self.focal_loss_alpha = focal_loss_alpha + + def _dice_loss(self, input, target): + input = paddle.reshape(input, shape=(paddle.shape(input)[0], -1)) + target = paddle.reshape(target, shape=(paddle.shape(target)[0], -1)) + a = paddle.sum(input * target, axis=1) + b = paddle.sum(input * input, axis=1) + 0.001 + c = paddle.sum(target * target, axis=1) + 0.001 + d = (2 * a) / (b + c) + return 1 - d + + def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels, + num_ins): + """ + Get loss of network of SOLOv2. + Args: + ins_pred_list (list): Variable list of instance branch output. + ins_label_list (list): List of instance labels pre batch. + cate_preds (list): Concat Variable list of categroy branch output. + cate_labels (list): Concat list of categroy labels pre batch. + num_ins (int): Number of positive samples in a mini-batch. + Returns: + loss_ins (Variable): The instance loss Variable of SOLOv2 network. + loss_cate (Variable): The category loss Variable of SOLOv2 network. + """ + + #1. 
Ues dice_loss to calculate instance loss + loss_ins = [] + total_weights = paddle.zeros(shape=[1], dtype='float32') + for input, target in zip(ins_pred_list, ins_label_list): + if input is None: + continue + target = paddle.cast(target, 'float32') + target = paddle.reshape( + target, + shape=[-1, paddle.shape(input)[-2], paddle.shape(input)[-1]]) + weights = paddle.cast( + paddle.sum(target, axis=[1, 2]) > 0, 'float32') + input = F.sigmoid(input) + dice_out = paddle.multiply(self._dice_loss(input, target), weights) + total_weights += paddle.sum(weights) + loss_ins.append(dice_out) + loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights + loss_ins = loss_ins * self.ins_loss_weight + + #2. Ues sigmoid_focal_loss to calculate category loss + # expand onehot labels + num_classes = cate_preds.shape[-1] + cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1) + cate_labels_bin = cate_labels_bin[:, 1:] + + loss_cate = F.sigmoid_focal_loss( + cate_preds, + label=cate_labels_bin, + normalizer=num_ins + 1., + gamma=self.focal_loss_gamma, + alpha=self.focal_loss_alpha) + + return loss_ins, loss_cate diff --git a/ppdet/modeling/losses/ssd_loss.py b/ppdet/modeling/losses/ssd_loss.py new file mode 100644 index 0000000..0b68f31 --- /dev/null +++ b/ppdet/modeling/losses/ssd_loss.py @@ -0,0 +1,162 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ..ops import iou_similarity +from ..bbox_utils import bbox2delta + +__all__ = ['SSDLoss'] + + +@register +class SSDLoss(nn.Layer): + """ + SSDLoss + + Args: + overlap_threshold (float32, optional): IoU threshold for negative bboxes + and positive bboxes, 0.5 by default. + neg_pos_ratio (float): The ratio of negative samples / positive samples. + loc_loss_weight (float): The weight of loc_loss. + conf_loss_weight (float): The weight of conf_loss. + prior_box_var (list): Variances corresponding to prior box coord, [0.1, + 0.1, 0.2, 0.2] by default. + """ + + def __init__(self, + overlap_threshold=0.5, + neg_pos_ratio=3.0, + loc_loss_weight=1.0, + conf_loss_weight=1.0, + prior_box_var=[0.1, 0.1, 0.2, 0.2]): + super(SSDLoss, self).__init__() + self.overlap_threshold = overlap_threshold + self.neg_pos_ratio = neg_pos_ratio + self.loc_loss_weight = loc_loss_weight + self.conf_loss_weight = conf_loss_weight + self.prior_box_var = [1. 
/ a for a in prior_box_var] + + def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes, + bg_index): + """ + Args: + gt_bbox (Tensor): [B, N, 4] + gt_label (Tensor): [B, N, 1] + prior_boxes (Tensor): [A, 4] + bg_index (int): Background class index + """ + batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0] + ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape( + (batch_size, -1, num_priors)) + + # Calculate the number of object per sample. + num_object = (ious.sum(axis=-1) > 0).astype('int64').sum(axis=-1) + + # For each prior box, get the max IoU of all GTs. + prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1) + # For each GT, get the max IoU of all prior boxes. + gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2) + + # Gather target bbox and label according to 'prior_argmax_iou' index. + batch_ind = paddle.arange( + 0, batch_size, dtype='int64').unsqueeze(-1).tile([1, num_priors]) + prior_argmax_iou = paddle.stack([batch_ind, prior_argmax_iou], axis=-1) + targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou) + targets_label = paddle.gather_nd(gt_label, prior_argmax_iou) + # Assign negative + bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index, + 'int64') + targets_label = paddle.where( + prior_max_iou.unsqueeze(-1) < self.overlap_threshold, + bg_index_tensor, targets_label) + + # Ensure each GT can match the max IoU prior box. + for i in range(batch_size): + if num_object[i] > 0: + targets_bbox[i] = paddle.scatter( + targets_bbox[i], gt_argmax_iou[i, :int(num_object[i])], + gt_bbox[i, :int(num_object[i])]) + targets_label[i] = paddle.scatter( + targets_label[i], gt_argmax_iou[i, :int(num_object[i])], + gt_label[i, :int(num_object[i])]) + + # Encode box + prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1]) + targets_bbox = bbox2delta( + prior_boxes.reshape([-1, 4]), + targets_bbox.reshape([-1, 4]), self.prior_box_var) + targets_bbox = targets_bbox.reshape([batch_size, -1, 4]) + + return targets_bbox, targets_label + + def _mine_hard_example(self, conf_loss, targets_label, bg_index): + pos = (targets_label != bg_index).astype(conf_loss.dtype) + num_pos = pos.sum(axis=1, keepdim=True) + neg = (targets_label == bg_index).astype(conf_loss.dtype) + + conf_loss = conf_loss.clone() * neg + loss_idx = conf_loss.argsort(axis=1, descending=True) + idx_rank = loss_idx.argsort(axis=1) + num_negs = [] + for i in range(conf_loss.shape[0]): + cur_num_pos = num_pos[i] + num_neg = paddle.clip( + cur_num_pos * self.neg_pos_ratio, max=pos.shape[1]) + num_negs.append(num_neg) + num_neg = paddle.stack(num_negs).expand_as(idx_rank) + neg_mask = (idx_rank < num_neg).astype(conf_loss.dtype) + + return (neg_mask + pos).astype('bool') + + def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes): + boxes = paddle.concat(boxes, axis=1) + scores = paddle.concat(scores, axis=1) + gt_label = gt_label.unsqueeze(-1).astype('int64') + prior_boxes = paddle.concat(prior_boxes, axis=0) + bg_index = scores.shape[-1] - 1 + + # Match bbox and get targets. + targets_bbox, targets_label = \ + self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index) + targets_bbox.stop_gradient = True + targets_label.stop_gradient = True + + # Compute regression loss. + # Select positive samples. 
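+ # Only priors matched to a ground truth (targets_label != bg_index) contribute;
+ # smooth L1 is taken on the encoded deltas and masked by bbox_mask below.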
+ bbox_mask = (targets_label != bg_index).astype(boxes.dtype) + loc_loss = bbox_mask * F.smooth_l1_loss( + boxes, targets_bbox, reduction='none') + loc_loss = loc_loss.sum() * self.loc_loss_weight + + # Compute confidence loss. + conf_loss = F.softmax_with_cross_entropy(scores, targets_label) + # Mining hard examples. + label_mask = self._mine_hard_example( + conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index) + conf_loss = conf_loss * label_mask.unsqueeze(-1).astype(conf_loss.dtype) + conf_loss = conf_loss.sum() * self.conf_loss_weight + + # Compute overall weighted loss. + normalizer = (targets_label != bg_index).astype('float32').sum().clip( + min=1) + loss = (conf_loss + loc_loss) / (normalizer + 1e-9) + + return loss diff --git a/ppdet/modeling/losses/yolo_loss.py b/ppdet/modeling/losses/yolo_loss.py new file mode 100644 index 0000000..657959c --- /dev/null +++ b/ppdet/modeling/losses/yolo_loss.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register + +from ..bbox_utils import decode_yolo, xywh2xyxy, iou_similarity + +__all__ = ['YOLOv3Loss'] + + +def bbox_transform(pbox, anchor, downsample): + pbox = decode_yolo(pbox, anchor, downsample) + pbox = xywh2xyxy(pbox) + return pbox + + +@register +class YOLOv3Loss(nn.Layer): + + __inject__ = ['iou_loss', 'iou_aware_loss'] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + ignore_thresh=0.7, + label_smooth=False, + downsample=[32, 16, 8], + scale_x_y=1., + iou_loss=None, + iou_aware_loss=None): + """ + YOLOv3Loss layer + + Args: + num_calsses (int): number of foreground classes + ignore_thresh (float): threshold to ignore confidence loss + label_smooth (bool): whether to use label smoothing + downsample (list): downsample ratio for each detection block + scale_x_y (float): scale_x_y factor + iou_loss (object): IoULoss instance + iou_aware_loss (object): IouAwareLoss instance + """ + super(YOLOv3Loss, self).__init__() + self.num_classes = num_classes + self.ignore_thresh = ignore_thresh + self.label_smooth = label_smooth + self.downsample = downsample + self.scale_x_y = scale_x_y + self.iou_loss = iou_loss + self.iou_aware_loss = iou_aware_loss + self.distill_pairs = [] + + def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): + # pbox + pbox = decode_yolo(pbox, anchor, downsample) + pbox = xywh2xyxy(pbox) + pbox = paddle.concat(pbox, axis=-1) + b = pbox.shape[0] + pbox = pbox.reshape((b, -1, 4)) + # gbox + gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 + gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 + gbox = paddle.concat([gxy, gwh], axis=-1) + + iou = iou_similarity(pbox, gbox) + iou.stop_gradient = True + iou_max = iou.max(2) # [N, M1] + iou_mask = paddle.cast(iou_max <= self.ignore_thresh, 
dtype=pbox.dtype) + iou_mask.stop_gradient = True + + pobj = pobj.reshape((b, -1)) + tobj = tobj.reshape((b, -1)) + obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) + obj_mask.stop_gradient = True + + loss_obj = F.binary_cross_entropy_with_logits( + pobj, obj_mask, reduction='none') + loss_obj_pos = (loss_obj * tobj) + loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) + return loss_obj_pos + loss_obj_neg + + def cls_loss(self, pcls, tcls): + if self.label_smooth: + delta = min(1. / self.num_classes, 1. / 40) + pos, neg = 1 - delta, delta + # 1 for positive, 0 for negative + tcls = pos * paddle.cast( + tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( + tcls <= 0., dtype=tcls.dtype) + + loss_cls = F.binary_cross_entropy_with_logits( + pcls, tcls, reduction='none') + return loss_cls + + def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., + eps=1e-10): + na = len(anchor) + b, c, h, w = p.shape + if self.iou_aware_loss: + ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] + ioup = ioup.unsqueeze(-1) + p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) + x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] + w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] + obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] + self.distill_pairs.append([x, y, w, h, obj, pcls]) + + t = t.transpose((0, 1, 3, 4, 2)) + tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] + tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] + tscale = t[:, :, :, :, 4:5] + tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] + + tscale_obj = tscale * tobj + loss = dict() + + x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) + y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) + + if abs(scale - 1.) < eps: + loss_x = F.binary_cross_entropy(x, tx, reduction='none') + loss_y = F.binary_cross_entropy(y, ty, reduction='none') + loss_xy = tscale_obj * (loss_x + loss_y) + else: + loss_x = paddle.abs(x - tx) + loss_y = paddle.abs(y - ty) + loss_xy = tscale_obj * (loss_x + loss_y) + + loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() + + loss_w = paddle.abs(w - tw) + loss_h = paddle.abs(h - th) + loss_wh = tscale_obj * (loss_w + loss_h) + loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() + + loss['loss_xy'] = loss_xy + loss['loss_wh'] = loss_wh + + if self.iou_loss is not None: + # warn: do not modify x, y, w, h in place + box, tbox = [x, y, w, h], [tx, ty, tw, th] + pbox = bbox_transform(box, anchor, downsample) + gbox = bbox_transform(tbox, anchor, downsample) + loss_iou = self.iou_loss(pbox, gbox) + loss_iou = loss_iou * tscale_obj + loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() + loss['loss_iou'] = loss_iou + + if self.iou_aware_loss is not None: + box, tbox = [x, y, w, h], [tx, ty, tw, th] + pbox = bbox_transform(box, anchor, downsample) + gbox = bbox_transform(tbox, anchor, downsample) + loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) + loss_iou_aware = loss_iou_aware * tobj + loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() + loss['loss_iou_aware'] = loss_iou_aware + + box = [x, y, w, h] + loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) + loss_obj = loss_obj.sum(-1).mean() + loss['loss_obj'] = loss_obj + loss_cls = self.cls_loss(pcls, tcls) * tobj + loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() + loss['loss_cls'] = loss_cls + return loss + + def forward(self, inputs, targets, anchors): + np = len(inputs) + gt_targets = [targets['target{}'.format(i)] for i in range(np)] + gt_box = targets['gt_bbox'] + yolo_losses = dict() + self.distill_pairs.clear() + for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, + 
self.downsample): + yolo_loss = self.yolov3_loss(x, t, gt_box, anchor, downsample, + self.scale_x_y) + for k, v in yolo_loss.items(): + if k in yolo_losses: + yolo_losses[k] += v + else: + yolo_losses[k] = v + + loss = 0 + for k, v in yolo_losses.items(): + loss += v + + yolo_losses['loss'] = loss + return yolo_losses diff --git a/ppdet/modeling/necks/__init__.py b/ppdet/modeling/necks/__init__.py new file mode 100644 index 0000000..9a0f150 --- /dev/null +++ b/ppdet/modeling/necks/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import fpn +from . import yolo_fpn +from . import hrfpn +from . import ttf_fpn + +from .fpn import * +from .yolo_fpn import * +from .hrfpn import * +from .ttf_fpn import * diff --git a/ppdet/modeling/necks/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/necks/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..e4c1c24 Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/necks/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..2aaf16a Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/fpn.cpython-38.pyc b/ppdet/modeling/necks/__pycache__/fpn.cpython-38.pyc new file mode 100644 index 0000000..b8bf0f0 Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/fpn.cpython-38.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/fpn.cpython-39.pyc b/ppdet/modeling/necks/__pycache__/fpn.cpython-39.pyc new file mode 100644 index 0000000..bf9ab2f Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/fpn.cpython-39.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/hrfpn.cpython-38.pyc b/ppdet/modeling/necks/__pycache__/hrfpn.cpython-38.pyc new file mode 100644 index 0000000..a2b80d8 Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/hrfpn.cpython-38.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/hrfpn.cpython-39.pyc b/ppdet/modeling/necks/__pycache__/hrfpn.cpython-39.pyc new file mode 100644 index 0000000..d913e8f Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/hrfpn.cpython-39.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-38.pyc b/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-38.pyc new file mode 100644 index 0000000..a99a4f2 Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-38.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-39.pyc b/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-39.pyc new file mode 100644 index 0000000..991e50d Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/ttf_fpn.cpython-39.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-38.pyc b/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-38.pyc new file mode 100644 index 
0000000..061be7e Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-38.pyc differ diff --git a/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-39.pyc b/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-39.pyc new file mode 100644 index 0000000..edfacd7 Binary files /dev/null and b/ppdet/modeling/necks/__pycache__/yolo_fpn.cpython-39.pyc differ diff --git a/ppdet/modeling/necks/fpn.py b/ppdet/modeling/necks/fpn.py new file mode 100644 index 0000000..867b7dc --- /dev/null +++ b/ppdet/modeling/necks/fpn.py @@ -0,0 +1,233 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import XavierUniform +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from ppdet.modeling.layers import ConvNormLayer +from ..shape_spec import ShapeSpec + +__all__ = ['FPN'] + + +@register +@serializable +class FPN(nn.Layer): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channel (list[int]): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. + default False + + """ + + def __init__(self, + in_channels, + out_channel, + spatial_scales=[0.25, 0.125, 0.0625, 0.03125], + has_extra_convs=False, + extra_stage=1, + use_c5=True, + norm_type=None, + norm_decay=0., + freeze_norm=False, + relu_before_extra_convs=True): + super(FPN, self).__init__() + self.out_channel = out_channel + for s in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
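+ # Each extra stage halves the spatial scale (doubles the stride), so a backbone
+ # ending at 1/32 resolution gains a 1/64 output level when extra_stage is 1.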
+ self.spatial_scales = spatial_scales + self.has_extra_convs = has_extra_convs + self.extra_stage = extra_stage + self.use_c5 = use_c5 + self.relu_before_extra_convs = relu_before_extra_convs + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + + self.lateral_convs = [] + self.fpn_convs = [] + fan = out_channel * 3 * 3 + + # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone + # 0 <= st_stage < ed_stage <= 3 + st_stage = 4 - len(in_channels) + ed_stage = st_stage + len(in_channels) - 1 + for i in range(st_stage, ed_stage + 1): + if i == 3: + lateral_name = 'fpn_inner_res5_sum' + else: + lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) + in_c = in_channels[i - st_stage] + if self.norm_type is not None: + lateral = self.add_sublayer( + lateral_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channel, + filter_size=1, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=in_c))) + else: + lateral = self.add_sublayer( + lateral_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channel, + kernel_size=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=in_c)))) + self.lateral_convs.append(lateral) + + fpn_name = 'fpn_res{}_sum'.format(i + 2) + if self.norm_type is not None: + fpn_conv = self.add_sublayer( + fpn_name, + ConvNormLayer( + ch_in=out_channel, + ch_out=out_channel, + filter_size=3, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + fpn_conv = self.add_sublayer( + fpn_name, + nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(fpn_conv) + + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + if self.has_extra_convs: + for i in range(self.extra_stage): + lvl = ed_stage + 1 + i + if i == 0 and self.use_c5: + in_c = in_channels[-1] + else: + in_c = out_channel + extra_fpn_name = 'fpn_{}'.format(lvl + 2) + if self.norm_type is not None: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channel, + filter_size=3, + stride=2, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channel, + kernel_size=3, + stride=2, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(extra_fpn_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + def forward(self, body_feats): + laterals = [] + num_levels = len(body_feats) + for i in range(num_levels): + laterals.append(self.lateral_convs[i](body_feats[i])) + + for i in range(1, num_levels): + lvl = num_levels - i + upsample = F.interpolate( + laterals[lvl], + scale_factor=2., + mode='nearest', ) + laterals[lvl - 1] += upsample + + fpn_output = [] + for lvl in range(num_levels): + fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) + + if self.extra_stage > 0: + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not self.has_extra_convs: + assert self.extra_stage == 1, 
'extra_stage should be 1 if FPN has not extra convs' + fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + else: + if self.use_c5: + extra_source = body_feats[-1] + else: + extra_source = fpn_output[-1] + fpn_output.append(self.fpn_convs[num_levels](extra_source)) + + for i in range(1, self.extra_stage): + if self.relu_before_extra_convs: + fpn_output.append(self.fpn_convs[num_levels + i](F.relu( + fpn_output[-1]))) + else: + fpn_output.append(self.fpn_convs[num_levels + i]( + fpn_output[-1])) + return fpn_output + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.out_channel, stride=1. / s) + for s in self.spatial_scales + ] diff --git a/ppdet/modeling/necks/hrfpn.py b/ppdet/modeling/necks/hrfpn.py new file mode 100644 index 0000000..4b737c9 --- /dev/null +++ b/ppdet/modeling/necks/hrfpn.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +from paddle import ParamAttr +import paddle.nn as nn +from paddle.regularizer import L2Decay +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['HRFPN'] + + +@register +class HRFPN(nn.Layer): + """ + Args: + in_channels (list): number of input feature channels from backbone + out_channel (int): number of output feature channels + share_conv (bool): whether to share conv for different layers' reduction + extra_stage (int): add extra stage for returning HRFPN fpn_feats + spatial_scales (list): feature map scaling factor + """ + + def __init__(self, + in_channels=[18, 36, 72, 144], + out_channel=256, + share_conv=False, + extra_stage=1, + spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32]): + super(HRFPN, self).__init__() + in_channel = sum(in_channels) + self.in_channel = in_channel + self.out_channel = out_channel + self.share_conv = share_conv + for i in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
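+ # HRFPN fuses all HRNet branches: lower-resolution inputs are bilinearly upsampled
+ # to the largest map, concatenated, reduced by a 1x1 conv, then average-pooled and
+ # run through 3x3 convs to produce the output pyramid (see forward below).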
+ self.spatial_scales = spatial_scales + self.num_out = len(self.spatial_scales) + + self.reduction = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=1, + weight_attr=ParamAttr(name='hrfpn_reduction_weights'), + bias_attr=False) + + if share_conv: + self.fpn_conv = nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(name='fpn_conv_weights'), + bias_attr=False) + else: + self.fpn_conv = [] + for i in range(self.num_out): + conv_name = "fpn_conv_" + str(i) + conv = self.add_sublayer( + conv_name, + nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(name=conv_name + "_weights"), + bias_attr=False)) + self.fpn_conv.append(conv) + + def forward(self, body_feats): + num_backbone_stages = len(body_feats) + + outs = [] + outs.append(body_feats[0]) + + # resize + for i in range(1, num_backbone_stages): + resized = F.interpolate( + body_feats[i], scale_factor=2**i, mode='bilinear') + outs.append(resized) + + # concat + out = paddle.concat(outs, axis=1) + assert out.shape[ + 1] == self.in_channel, 'in_channel should be {}, be received {}'.format( + out.shape[1], self.in_channel) + + # reduction + out = self.reduction(out) + + # conv + outs = [out] + for i in range(1, self.num_out): + outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i)) + outputs = [] + + for i in range(self.num_out): + conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i] + conv = conv_func(outs[i]) + outputs.append(conv) + + fpn_feats = [outputs[k] for k in range(self.num_out)] + return fpn_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self.out_channel, stride=1. / s) + for s in self.spatial_scales + ] diff --git a/ppdet/modeling/necks/ttf_fpn.py b/ppdet/modeling/necks/ttf_fpn.py new file mode 100644 index 0000000..9c7f392 --- /dev/null +++ b/ppdet/modeling/necks/ttf_fpn.py @@ -0,0 +1,243 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform +from paddle import ParamAttr +from ppdet.core.workspace import register, serializable +from paddle.regularizer import L2Decay +from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv +import math +from ppdet.modeling.ops import batch_norm +from ..shape_spec import ShapeSpec + +__all__ = ['TTFFPN'] + + +class Upsample(nn.Layer): + def __init__(self, ch_in, ch_out, norm_type='bn'): + super(Upsample, self).__init__() + fan_in = ch_in * 3 * 3 + stdv = 1. 
/ math.sqrt(fan_in) + self.dcn = DeformableConvV2( + ch_in, + ch_out, + kernel_size=3, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + initializer=Constant(0), + regularizer=L2Decay(0.), + learning_rate=2.), + lr_scale=2., + regularizer=L2Decay(0.)) + + self.bn = batch_norm( + ch_out, norm_type=norm_type, initializer=Constant(1.)) + + def forward(self, feat): + dcn = self.dcn(feat) + bn = self.bn(dcn) + relu = F.relu(bn) + out = F.interpolate(relu, scale_factor=2., mode='bilinear') + return out + + +class DeConv(nn.Layer): + def __init__(self, ch_in, ch_out, norm_type='bn'): + super(DeConv, self).__init__() + self.deconv = nn.Sequential() + conv1 = ConvNormLayer( + ch_in=ch_in, + ch_out=ch_out, + stride=1, + filter_size=1, + norm_type=norm_type, + initializer=XavierUniform()) + conv2 = nn.Conv2DTranspose( + in_channels=ch_out, + out_channels=ch_out, + kernel_size=4, + padding=1, + stride=2, + groups=ch_out, + weight_attr=ParamAttr(initializer=XavierUniform()), + bias_attr=False) + bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.) + conv3 = ConvNormLayer( + ch_in=ch_out, + ch_out=ch_out, + stride=1, + filter_size=1, + norm_type=norm_type, + initializer=XavierUniform()) + + self.deconv.add_sublayer('conv1', conv1) + self.deconv.add_sublayer('relu6_1', nn.ReLU6()) + self.deconv.add_sublayer('conv2', conv2) + self.deconv.add_sublayer('bn', bn) + self.deconv.add_sublayer('relu6_2', nn.ReLU6()) + self.deconv.add_sublayer('conv3', conv3) + self.deconv.add_sublayer('relu6_3', nn.ReLU6()) + + def forward(self, inputs): + return self.deconv(inputs) + + +class LiteUpsample(nn.Layer): + def __init__(self, ch_in, ch_out, norm_type='bn'): + super(LiteUpsample, self).__init__() + self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type) + self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type) + + def forward(self, inputs): + deconv_up = self.deconv(inputs) + conv = self.conv(inputs) + interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear') + return deconv_up + interp_up + + +class ShortCut(nn.Layer): + def __init__(self, + layer_num, + ch_in, + ch_out, + norm_type='bn', + lite_neck=False, + name=None): + super(ShortCut, self).__init__() + shortcut_conv = nn.Sequential() + for i in range(layer_num): + fan_out = 3 * 3 * ch_out + std = math.sqrt(2. / fan_out) + in_channels = ch_in if i == 0 else ch_out + shortcut_name = name + '.conv.{}'.format(i) + if lite_neck: + shortcut_conv.add_sublayer( + shortcut_name, + LiteConv( + in_channels=in_channels, + out_channels=ch_out, + with_act=i < layer_num - 1, + norm_type=norm_type)) + else: + shortcut_conv.add_sublayer( + shortcut_name, + nn.Conv2D( + in_channels=in_channels, + out_channels=ch_out, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=Normal(0, std)), + bias_attr=ParamAttr( + learning_rate=2., regularizer=L2Decay(0.)))) + if i < layer_num - 1: + shortcut_conv.add_sublayer(shortcut_name + '.act', + nn.ReLU()) + self.shortcut = self.add_sublayer('shortcut', shortcut_conv) + + def forward(self, feat): + out = self.shortcut(feat) + return out + + +@register +@serializable +class TTFFPN(nn.Layer): + """ + Args: + in_channels (list): number of input feature channels from backbone. + [128,256,512,1024] by default, means the channels of DarkNet53 + backbone return_idx [1,2,3,4]. + planes (list): the number of output feature channels of FPN. + [256, 128, 64] by default + shortcut_num (list): the number of convolution layers in each shortcut. 
+ [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs + in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. + norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. + bn by default + lite_neck (bool): whether to use lite conv in TTFNet FPN, + False by default + fusion_method (string): the method to fusion upsample and lateral layer. + 'add' and 'concat' are optional, add by default + """ + + __shared__ = ['norm_type'] + + def __init__(self, + in_channels, + planes=[256, 128, 64], + shortcut_num=[3, 2, 1], + norm_type='bn', + lite_neck=False, + fusion_method='add'): + super(TTFFPN, self).__init__() + self.planes = planes + self.shortcut_num = shortcut_num[::-1] + self.shortcut_len = len(shortcut_num) + self.ch_in = in_channels[::-1] + self.fusion_method = fusion_method + + self.upsample_list = [] + self.shortcut_list = [] + self.upper_list = [] + for i, out_c in enumerate(self.planes): + in_c = self.ch_in[i] if i == 0 else self.upper_list[-1] + upsample_module = LiteUpsample if lite_neck else Upsample + upsample = self.add_sublayer( + 'upsample.' + str(i), + upsample_module( + in_c, out_c, norm_type=norm_type)) + self.upsample_list.append(upsample) + if i < self.shortcut_len: + shortcut = self.add_sublayer( + 'shortcut.' + str(i), + ShortCut( + self.shortcut_num[i], + self.ch_in[i + 1], + out_c, + norm_type=norm_type, + lite_neck=lite_neck, + name='shortcut.' + str(i))) + self.shortcut_list.append(shortcut) + if self.fusion_method == 'add': + upper_c = out_c + elif self.fusion_method == 'concat': + upper_c = out_c * 2 + else: + raise ValueError('Illegal fusion method. Expected add or\ + concat, but received {}'.format(self.fusion_method)) + self.upper_list.append(upper_c) + + def forward(self, inputs): + feat = inputs[-1] + for i, out_c in enumerate(self.planes): + feat = self.upsample_list[i](feat) + if i < self.shortcut_len: + shortcut = self.shortcut_list[i](inputs[-i - 2]) + if self.fusion_method == 'add': + feat = feat + shortcut + else: + feat = paddle.concat([feat, shortcut], axis=1) + return feat + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=self.upper_list[-1], )] diff --git a/ppdet/modeling/necks/yolo_fpn.py b/ppdet/modeling/necks/yolo_fpn.py new file mode 100644 index 0000000..2545897 --- /dev/null +++ b/ppdet/modeling/necks/yolo_fpn.py @@ -0,0 +1,961 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
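+ # Illustrative usage sketch (DarkNet53-style channels assumed, not prescribed here):
+ #   fpn = YOLOv3FPN(in_channels=[256, 512, 1024], norm_type='bn')
+ #   yolo_feats = fpn([c3, c4, c5])  # backbone outputs ordered shallow to deep
+ #   # tip channels per level are 1024 // 2**i, i.e. 1024, 512 and 256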
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from ppdet.core.workspace import register, serializable +from ..backbones.darknet import ConvBNLayer +import numpy as np + +from ..shape_spec import ShapeSpec + +__all__ = ['YOLOv3FPN', 'PPYOLOFPN'] + + +def add_coord(x, data_format): + b = x.shape[0] + if data_format == 'NCHW': + h = x.shape[2] + w = x.shape[3] + else: + h = x.shape[1] + w = x.shape[2] + + gx = paddle.arange(w, dtype='float32') / (w - 1.) * 2.0 - 1. + if data_format == 'NCHW': + gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) + else: + gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) + gx.stop_gradient = True + + gy = paddle.arange(h, dtype='float32') / (h - 1.) * 2.0 - 1. + if data_format == 'NCHW': + gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) + else: + gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) + gy.stop_gradient = True + + return gx, gy + + +class YoloDetBlock(nn.Layer): + def __init__(self, ch_in, channel, norm_type, name, data_format='NCHW'): + """ + YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 + + Args: + ch_in (int): input channel + channel (int): base channel + norm_type (str): batch norm type + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(YoloDetBlock, self).__init__() + self.ch_in = ch_in + self.channel = channel + assert channel % 2 == 0, \ + "channel {} cannot be divided by 2".format(channel) + conv_def = [ + ['conv0', ch_in, channel, 1, '.0.0'], + ['conv1', channel, channel * 2, 3, '.0.1'], + ['conv2', channel * 2, channel, 1, '.1.0'], + ['conv3', channel, channel * 2, 3, '.1.1'], + ['route', channel * 2, channel, 1, '.2'], + ] + + self.conv_module = nn.Sequential() + for idx, (conv_name, ch_in, ch_out, filter_size, + post_name) in enumerate(conv_def): + self.conv_module.add_sublayer( + conv_name, + ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + padding=(filter_size - 1) // 2, + norm_type=norm_type, + data_format=data_format, + name=name + post_name)) + + self.tip = ConvBNLayer( + ch_in=channel, + ch_out=channel * 2, + filter_size=3, + padding=1, + norm_type=norm_type, + data_format=data_format, + name=name + '.tip') + + def forward(self, inputs): + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class SPP(nn.Layer): + def __init__(self, + ch_in, + ch_out, + k, + pool_size, + norm_type, + name, + act='leaky', + data_format='NCHW'): + """ + SPP layer, which consist of four pooling layer follwed by conv layer + + Args: + ch_in (int): input channel of conv layer + ch_out (int): output channel of conv layer + k (int): kernel size of conv layer + norm_type (str): batch norm type + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(SPP, self).__init__() + self.pool = [] + self.data_format = data_format + for size in pool_size: + pool = self.add_sublayer( + '{}.pool1'.format(name), + nn.MaxPool2D( + kernel_size=size, + stride=1, + padding=size // 2, + data_format=data_format, + ceil_mode=False)) + self.pool.append(pool) + self.conv = ConvBNLayer( + ch_in, + ch_out, + k, + padding=k // 2, + norm_type=norm_type, + name=name, + act=act, + data_format=data_format) + + def forward(self, x): + outs = [x] + for pool in self.pool: + outs.append(pool(x)) + if self.data_format == "NCHW": + y = paddle.concat(outs, axis=1) + else: + y = paddle.concat(outs, axis=-1) + + y = self.conv(y) + return y + + +class DropBlock(nn.Layer): + def __init__(self, 
block_size, keep_prob, name, data_format='NCHW'): + """ + DropBlock layer, see https://arxiv.org/abs/1810.12890 + + Args: + block_size (int): block size + keep_prob (int): keep probability + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(DropBlock, self).__init__() + self.block_size = block_size + self.keep_prob = keep_prob + self.name = name + self.data_format = data_format + + def forward(self, x): + if not self.training or self.keep_prob == 1: + return x + else: + gamma = (1. - self.keep_prob) / (self.block_size**2) + if self.data_format == 'NCHW': + shape = x.shape[2:] + else: + shape = x.shape[1:3] + for s in shape: + gamma *= s / (s - self.block_size + 1) + + matrix = paddle.cast(paddle.rand(x.shape, x.dtype) < gamma, x.dtype) + mask_inv = F.max_pool2d( + matrix, + self.block_size, + stride=1, + padding=self.block_size // 2, + data_format=self.data_format) + mask = 1. - mask_inv + y = x * mask * (mask.numel() / mask.sum()) + return y + + +class CoordConv(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + padding, + norm_type, + name, + data_format='NCHW'): + """ + CoordConv layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + padding (int): padding size, default 0 + norm_type (str): batch norm type, default bn + name (str): layer name + data_format (str): data format, NCHW or NHWC + + """ + super(CoordConv, self).__init__() + self.conv = ConvBNLayer( + ch_in + 2, + ch_out, + filter_size=filter_size, + padding=padding, + norm_type=norm_type, + data_format=data_format, + name=name) + self.data_format = data_format + + def forward(self, x): + gx, gy = add_coord(x, self.data_format) + if self.data_format == 'NCHW': + y = paddle.concat([x, gx, gy], axis=1) + else: + y = paddle.concat([x, gx, gy], axis=-1) + y = self.conv(y) + return y + + +class PPYOLODetBlock(nn.Layer): + def __init__(self, cfg, name, data_format='NCHW'): + """ + PPYOLODetBlock layer + + Args: + cfg (list): layer configs for this block + name (str): block name + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLODetBlock, self).__init__() + self.conv_module = nn.Sequential() + for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): + kwargs.update( + name='{}.{}'.format(name, conv_name), data_format=data_format) + self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) + + conv_name, layer, args, kwargs = cfg[-1] + kwargs.update( + name='{}.{}'.format(name, conv_name), data_format=data_format) + self.tip = layer(*args, **kwargs) + + def forward(self, inputs): + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class PPYOLOTinyDetBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + name, + drop_block=False, + block_size=3, + keep_prob=0.9, + data_format='NCHW'): + """ + PPYOLO Tiny DetBlock layer + Args: + ch_in (list): input channel number + ch_out (list): output channel number + name (str): block name + drop_block: whether user DropBlock + block_size: drop block size + keep_prob: probability to keep block in DropBlock + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLOTinyDetBlock, self).__init__() + self.drop_block_ = drop_block + self.conv_module = nn.Sequential() + + cfgs = [ + # name, in channels, out channels, filter_size, + # stride, padding, groups + ['.0', ch_in, ch_out, 1, 1, 0, 1], + ['.1', ch_out, ch_out, 5, 1, 2, ch_out], + ['.2', ch_out, ch_out, 1, 1, 0, 1], + ['.route', ch_out, ch_out, 5, 1, 2, 
ch_out], + ] + for cfg in cfgs: + conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ + groups = cfg + self.conv_module.add_sublayer( + name + conv_name, + ConvBNLayer( + ch_in=conv_ch_in, + ch_out=conv_ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + name=name + conv_name)) + + self.tip = ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0, + groups=1, + name=name + conv_name) + + if self.drop_block_: + self.drop_block = DropBlock( + block_size=block_size, + keep_prob=keep_prob, + data_format=data_format, + name=name + '.dropblock') + + def forward(self, inputs): + if self.drop_block_: + inputs = self.drop_block(inputs) + route = self.conv_module(inputs) + tip = self.tip(route) + return route, tip + + +class PPYOLODetBlockCSP(nn.Layer): + def __init__(self, + cfg, + ch_in, + ch_out, + act, + norm_type, + name, + data_format='NCHW'): + """ + PPYOLODetBlockCSP layer + + Args: + cfg (list): layer configs for this block + ch_in (int): input channel + ch_out (int): output channel + act (str): default mish + name (str): block name + data_format (str): data format, NCHW or NHWC + """ + super(PPYOLODetBlockCSP, self).__init__() + self.data_format = data_format + self.conv1 = ConvBNLayer( + ch_in, + ch_out, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name + '.left', + data_format=data_format) + self.conv2 = ConvBNLayer( + ch_in, + ch_out, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name + '.right', + data_format=data_format) + self.conv3 = ConvBNLayer( + ch_out * 2, + ch_out * 2, + 1, + padding=0, + act=act, + norm_type=norm_type, + name=name, + data_format=data_format) + self.conv_module = nn.Sequential() + for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): + kwargs.update(name=name + layer_name, data_format=data_format) + self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) + + def forward(self, inputs): + conv_left = self.conv1(inputs) + conv_right = self.conv2(inputs) + conv_left = self.conv_module(conv_left) + if self.data_format == 'NCHW': + conv = paddle.concat([conv_left, conv_right], axis=1) + else: + conv = paddle.concat([conv_left, conv_right], axis=-1) + + conv = self.conv3(conv) + return conv, conv + + +@register +@serializable +class YOLOv3FPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[256, 512, 1024], + norm_type='bn', + data_format='NCHW'): + """ + YOLOv3FPN layer + + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + + """ + super(YOLOv3FPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + self.data_format = data_format + for i in range(self.num_blocks): + name = 'yolo_block.{}'.format(i) + in_channel = in_channels[-i - 1] + if i > 0: + in_channel += 512 // (2**i) + yolo_block = self.add_sublayer( + name, + YoloDetBlock( + in_channel, + channel=512 // (2**i), + norm_type=norm_type, + data_format=data_format, + name=name)) + self.yolo_blocks.append(yolo_block) + # tip layer output channel doubled + self._out_channels.append(1024 // (2**i)) + + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=512 // (2**i), + ch_out=256 // (2**i), + filter_size=1, 
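+                        # 1x1 transition halves the channels; in forward()
+                        # this route is 2x upsampled and concatenated with
+                        # the next (finer) backbone feature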
+ stride=1, + padding=0, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + yolo_feats = [] + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOFPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[512, 1024, 2048], + norm_type='bn', + data_format='NCHW', + coord_conv=False, + conv_block_num=2, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False): + """ + PPYOLOFPN layer + + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + coord_conv (bool): whether use CoordConv or not + conv_block_num (int): conv block num of each pan block + drop_block (bool): whether use DropBlock or not + block_size (int): block size of DropBlock + keep_prob (float): keep probability of DropBlock + spp (bool): whether use spp or not + + """ + super(PPYOLOFPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + # parse kwargs + self.coord_conv = coord_conv + self.drop_block = drop_block + self.block_size = block_size + self.keep_prob = keep_prob + self.spp = spp + self.conv_block_num = conv_block_num + self.data_format = data_format + if self.coord_conv: + ConvLayer = CoordConv + else: + ConvLayer = ConvBNLayer + + if self.drop_block: + dropblock_cfg = [[ + 'dropblock', DropBlock, [self.block_size, self.keep_prob], + dict() + ]] + else: + dropblock_cfg = [] + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + for i, ch_in in enumerate(self.in_channels[::-1]): + if i > 0: + ch_in += 512 // (2**i) + channel = 64 * (2**self.num_blocks) // (2**i) + base_cfg = [] + c_in, c_out = ch_in, channel + for j in range(self.conv_block_num): + base_cfg += [ + [ + 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], + dict( + padding=0, norm_type=norm_type) + ], + [ + 'conv{}'.format(2 * j + 1), ConvBNLayer, + [c_out, c_out * 2, 3], dict( + padding=1, norm_type=norm_type) + ], + ] + c_in, c_out = c_out * 2, c_out + + base_cfg += [[ + 'route', ConvLayer, [c_in, c_out, 1], dict( + padding=0, norm_type=norm_type) + ], [ + 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( + padding=1, norm_type=norm_type) + ]] + + if self.conv_block_num == 2: + if i == 0: + if self.spp: + spp_cfg = [[ + 'spp', SPP, [channel * 4, channel, 1], dict( + pool_size=[5, 9, 13], norm_type=norm_type) + ]] + else: + spp_cfg = [] + cfg = base_cfg[0:3] + spp_cfg + base_cfg[ + 3:4] + dropblock_cfg + base_cfg[4:6] + else: + cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] + elif self.conv_block_num == 0: + if self.spp and i == 0: + spp_cfg = [[ + 'spp', SPP, [c_in * 4, c_in, 1], dict( + pool_size=[5, 9, 13], norm_type=norm_type) + ]] + else: + 
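+                    # without SPP this level reduces to just the route/tip
+                    # convs (plus DropBlock, when enabled)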
spp_cfg = [] + cfg = spp_cfg + dropblock_cfg + base_cfg + name = 'yolo_block.{}'.format(i) + yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) + self.yolo_blocks.append(yolo_block) + self._out_channels.append(channel * 2) + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=channel, + ch_out=256 // (2**i), + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + yolo_feats = [] + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOTinyFPN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[80, 56, 34], + detection_block_channels=[160, 128, 96], + norm_type='bn', + data_format='NCHW', + **kwargs): + """ + PPYOLO Tiny FPN layer + Args: + in_channels (list): input channels for fpn + detection_block_channels (list): channels in fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + kwargs: extra key-value pairs, such as parameter of DropBlock and spp + """ + super(PPYOLOTinyFPN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels[::-1] + assert len(detection_block_channels + ) > 0, "detection_block_channelslength should > 0" + self.detection_block_channels = detection_block_channels + self.data_format = data_format + self.num_blocks = len(in_channels) + # parse kwargs + self.drop_block = kwargs.get('drop_block', False) + self.block_size = kwargs.get('block_size', 3) + self.keep_prob = kwargs.get('keep_prob', 0.9) + + self.spp_ = kwargs.get('spp', False) + if self.spp_: + self.spp = SPP(self.in_channels[0] * 4, + self.in_channels[0], + k=1, + pool_size=[5, 9, 13], + norm_type=norm_type, + name='spp') + + self._out_channels = [] + self.yolo_blocks = [] + self.routes = [] + for i, ( + ch_in, ch_out + ) in enumerate(zip(self.in_channels, self.detection_block_channels)): + name = 'yolo_block.{}'.format(i) + if i > 0: + ch_in += self.detection_block_channels[i - 1] + yolo_block = self.add_sublayer( + name, + PPYOLOTinyDetBlock( + ch_in, + ch_out, + name, + drop_block=self.drop_block, + block_size=self.block_size, + keep_prob=self.keep_prob)) + self.yolo_blocks.append(yolo_block) + self._out_channels.append(ch_out) + + if i < self.num_blocks - 1: + name = 'yolo_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.routes.append(route) + + def forward(self, blocks): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + + yolo_feats = [] + for i, block in enumerate(blocks): + if i == 0 and self.spp_: + 
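+                # SPP is applied only to the first block after reversal,
+                # i.e. the deepest (lowest-resolution) backbone feature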
block = self.spp(block) + + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.yolo_blocks[i](block) + yolo_feats.append(tip) + + if i < self.num_blocks - 1: + route = self.routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + return yolo_feats + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] + + +@register +@serializable +class PPYOLOPAN(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + in_channels=[512, 1024, 2048], + norm_type='bn', + data_format='NCHW', + act='mish', + conv_block_num=3, + drop_block=False, + block_size=3, + keep_prob=0.9, + spp=False): + """ + PPYOLOPAN layer with SPP, DropBlock and CSP connection. + + Args: + in_channels (list): input channels for fpn + norm_type (str): batch norm type, default bn + data_format (str): data format, NCHW or NHWC + act (str): activation function, default mish + conv_block_num (int): conv block num of each pan block + drop_block (bool): whether use DropBlock or not + block_size (int): block size of DropBlock + keep_prob (float): keep probability of DropBlock + spp (bool): whether use spp or not + + """ + super(PPYOLOPAN, self).__init__() + assert len(in_channels) > 0, "in_channels length should > 0" + self.in_channels = in_channels + self.num_blocks = len(in_channels) + # parse kwargs + self.drop_block = drop_block + self.block_size = block_size + self.keep_prob = keep_prob + self.spp = spp + self.conv_block_num = conv_block_num + self.data_format = data_format + if self.drop_block: + dropblock_cfg = [[ + 'dropblock', DropBlock, [self.block_size, self.keep_prob], + dict() + ]] + else: + dropblock_cfg = [] + + # fpn + self.fpn_blocks = [] + self.fpn_routes = [] + fpn_channels = [] + for i, ch_in in enumerate(self.in_channels[::-1]): + if i > 0: + ch_in += 512 // (2**(i - 1)) + channel = 512 // (2**i) + base_cfg = [] + for j in range(self.conv_block_num): + base_cfg += [ + # name, layer, args + [ + '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], + dict( + padding=0, act=act, norm_type=norm_type) + ], + [ + '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], + dict( + padding=1, act=act, norm_type=norm_type) + ] + ] + + if i == 0 and self.spp: + base_cfg[3] = [ + 'spp', SPP, [channel * 4, channel, 1], dict( + pool_size=[5, 9, 13], act=act, norm_type=norm_type) + ] + + cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] + name = 'fpn.{}'.format(i) + fpn_block = self.add_sublayer( + name, + PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, + data_format)) + self.fpn_blocks.append(fpn_block) + fpn_channels.append(channel * 2) + if i < self.num_blocks - 1: + name = 'fpn_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=channel * 2, + ch_out=channel, + filter_size=1, + stride=1, + padding=0, + act=act, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.fpn_routes.append(route) + # pan + self.pan_blocks = [] + self.pan_routes = [] + self._out_channels = [512 // (2**(self.num_blocks - 2)), ] + for i in reversed(range(self.num_blocks - 1)): + name = 'pan_transition.{}'.format(i) + route = self.add_sublayer( + name, + ConvBNLayer( + ch_in=fpn_channels[i + 1], + ch_out=fpn_channels[i + 1], + filter_size=3, + stride=2, + 
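+                    # the stride-2 3x3 conv downsamples the finer PAN feature
+                    # so it can be concatenated with the next coarser level
+                    # in the bottom-up path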
padding=1, + act=act, + norm_type=norm_type, + data_format=data_format, + name=name)) + self.pan_routes = [route, ] + self.pan_routes + base_cfg = [] + ch_in = fpn_channels[i] + fpn_channels[i + 1] + channel = 512 // (2**i) + for j in range(self.conv_block_num): + base_cfg += [ + # name, layer, args + [ + '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], + dict( + padding=0, act=act, norm_type=norm_type) + ], + [ + '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], + dict( + padding=1, act=act, norm_type=norm_type) + ] + ] + + cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] + name = 'pan.{}'.format(i) + pan_block = self.add_sublayer( + name, + PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, + data_format)) + + self.pan_blocks = [pan_block, ] + self.pan_blocks + self._out_channels.append(channel * 2) + + self._out_channels = self._out_channels[::-1] + + def forward(self, blocks): + assert len(blocks) == self.num_blocks + blocks = blocks[::-1] + # fpn + fpn_feats = [] + for i, block in enumerate(blocks): + if i > 0: + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + route, tip = self.fpn_blocks[i](block) + fpn_feats.append(tip) + + if i < self.num_blocks - 1: + route = self.fpn_routes[i](route) + route = F.interpolate( + route, scale_factor=2., data_format=self.data_format) + + pan_feats = [fpn_feats[-1], ] + route = fpn_feats[self.num_blocks - 1] + for i in reversed(range(self.num_blocks - 1)): + block = fpn_feats[i] + route = self.pan_routes[i](route) + if self.data_format == 'NCHW': + block = paddle.concat([route, block], axis=1) + else: + block = paddle.concat([route, block], axis=-1) + + route, tip = self.pan_blocks[i](block) + pan_feats.append(tip) + + return pan_feats[::-1] + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_channels': [i.channels for i in input_shape], } + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py new file mode 100644 index 0000000..f190a48 --- /dev/null +++ b/ppdet/modeling/ops.py @@ -0,0 +1,1577 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
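+
+# Most of the wrappers below follow the same pattern: in dygraph mode they
+# call the C++ kernels directly through `core.ops`, otherwise they append the
+# corresponding operator to the static graph via `LayerHelper`. A minimal,
+# illustrative dygraph sketch (shapes assume 3 anchors and 80 COCO classes,
+# i.e. C = 3 * (5 + 80) = 255):
+#
+#     import paddle
+#     from ppdet.modeling import ops
+#
+#     x = paddle.rand([1, 255, 13, 13])              # YOLO head output
+#     img_size = paddle.to_tensor([[416, 416]], dtype='int32')
+#     boxes, scores = ops.yolo_box(
+#         x, img_size, anchors=[10, 13, 16, 30, 33, 23],
+#         class_num=80, conf_thresh=0.01, downsample_ratio=32)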
+ +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +from paddle import ParamAttr +from paddle.regularizer import L2Decay + +from paddle.fluid.framework import Variable, in_dygraph_mode +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph import layers +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +import math +import six +import numpy as np +from functools import reduce + +__all__ = [ + 'roi_pool', + 'roi_align', + 'prior_box', + 'generate_proposals', + 'iou_similarity', + 'box_coder', + 'yolo_box', + 'multiclass_nms', + 'distribute_fpn_proposals', + 'collect_fpn_proposals', + 'matrix_nms', + 'batch_norm', + 'mish', +] + + +def mish(x): + return x * paddle.tanh(F.softplus(x)) + + +def batch_norm(ch, + norm_type='bn', + norm_decay=0., + initializer=None, + data_format='NCHW'): + if norm_type == 'sync_bn': + batch_norm = nn.SyncBatchNorm + else: + batch_norm = nn.BatchNorm2D + + return batch_norm( + ch, + weight_attr=ParamAttr( + initializer=initializer, regularizer=L2Decay(norm_decay)), + bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)), + data_format=data_format) + + +@paddle.jit.not_to_static +def roi_pool(input, + rois, + output_size, + spatial_scale=1.0, + rois_num=None, + name=None): + """ + + This operator implements the roi_pooling layer. + Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). + + The operator has three steps: + + 1. Dividing each region proposal into equal-sized sections with output_size(h, w); + 2. Finding the largest value in each section; + 3. Copying these max values to the output buffer. + + For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + + Args: + input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. + The data type is float32 or float64. + rois (Tensor): ROIs (Regions of Interest) to pool over. + 2D-Tensor or 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. + Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, + and (x2, y2) is the bottom right coordinates. + output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 + rois_num (Tensor): The number of RoIs in each image. Default: None + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + + Returns: + Tensor: The pooled feature, 4D-Tensor with the shape of [num_rois, C, output_size[0], output_size[1]]. + + + Examples: + + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + + x = paddle.static.data( + name='data', shape=[None, 256, 32, 32], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[None, 4], dtype='float32') + rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32') + + pool_out = ops.roi_pool( + input=x, + rois=rois, + output_size=(1, 1), + spatial_scale=1.0, + rois_num=rois_num) + """ + check_type(output_size, 'output_size', (int, tuple), 'roi_pool') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + pool_out, argmaxes = core.ops.roi_pool( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale) + return pool_out, argmaxes + + else: + check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') + check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool') + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out, argmaxes + + +@paddle.jit.not_to_static +def roi_align(input, + rois, + output_size, + spatial_scale=1.0, + sampling_ratio=-1, + rois_num=None, + aligned=True, + name=None): + """ + + Region of interest align (also known as RoI align) is to perform + bilinear interpolation on inputs of nonuniform sizes to obtain + fixed-size feature maps (e.g. 7*7) + + Dividing each region proposal into equal-sized sections with + the pooled_width and pooled_height. Location remains the origin + result. + + In each ROI bin, the value of the four regularly sampled locations + are computed directly through bilinear interpolation. The output is + the mean of four locations. + Thus avoid the misaligned problem. + + Args: + input (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. + The data type is float32 or float64. + rois (Tensor): ROIs (Regions of Interest) to pool over.It should be + a 2-D Tensor or 2-D LoDTensor of shape (num_rois, 4), the lod level is 1. + The data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], + (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. + output_size (int or tuple[int, int]): The pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float32, optional): Multiplicative spatial scale factor to translate ROI coords + from their input scale to the scale used when pooling. Default: 1.0 + sampling_ratio(int32, optional): number of sampling points in the interpolation grid. + If <=0, then grid points are adaptive to roi_width and pooled_w, likewise for height. Default: -1 + rois_num (Tensor): The number of RoIs in each image. Default: None + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. 
Usually name is no need to set and + None by default. + + Returns: + Tensor: + + Output: The output of ROIAlignOp is a 4-D tensor with shape (num_rois, channels, pooled_h, pooled_w). The data type is float32 or float64. + + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + + x = paddle.static.data( + name='data', shape=[None, 256, 32, 32], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[None, 4], dtype='float32') + rois_num = paddle.static.data(name='rois_num', shape=[None], dtype='int32') + align_out = ops.roi_align(input=x, + rois=rois, + ouput_size=(7, 7), + spatial_scale=0.5, + sampling_ratio=-1, + rois_num=rois_num) + """ + check_type(output_size, 'output_size', (int, tuple), 'roi_align') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + align_out = core.ops.roi_align( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale, + "sampling_ratio", sampling_ratio, "aligned", aligned) + return align_out + + else: + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'roi_align') + check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], + 'roi_align') + helper = LayerHelper('roi_align', **locals()) + dtype = helper.input_dtype() + align_out = helper.create_variable_for_type_inference(dtype) + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + "aligned": aligned, + }) + return align_out + + +@paddle.jit.not_to_static +def iou_similarity(x, y, box_normalized=True, name=None): + """ + Computes intersection-over-union (IOU) between two box lists. + Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, + boxes in 'Y' are shared by all instance of the batched inputs of X. + Given two boxes A and B, the calculation of IOU is as follows: + + $$ + IOU(A, B) = + \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} + $$ + + Args: + x (Tensor): Box list X is a 2-D Tensor with shape [N, 4] holds N + boxes, each box is represented as [xmin, ymin, xmax, ymax], + the shape of X is [N, 4]. [xmin, ymin] is the left top + coordinate of the box if the input is image feature map, they + are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the box. + The data type is float32 or float64. + y (Tensor): Box list Y holds M boxes, each box is represented as + [xmin, ymin, xmax, ymax], the shape of X is [N, 4]. + [xmin, ymin] is the left top coordinate of the box if the + input is image feature map, and [xmax, ymax] is the right + bottom coordinate of the box. The data type is float32 or float64. + box_normalized(bool): Whether treat the priorbox as a normalized box. + Set true by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output of iou_similarity op, a tensor with shape [N, M] + representing pairwise iou scores. The data type is same with x. + + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 4], dtype='float32') + y = paddle.static.data(name='y', shape=[None, 4], dtype='float32') + iou = ops.iou_similarity(x=x, y=y) + """ + + if in_dygraph_mode(): + out = core.ops.iou_similarity(x, y, 'box_normalized', box_normalized) + return out + else: + helper = LayerHelper("iou_similarity", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="iou_similarity", + inputs={"X": x, + "Y": y}, + attrs={"box_normalized": box_normalized}, + outputs={"Out": out}) + return out + + +@paddle.jit.not_to_static +def collect_fpn_proposals(multi_rois, + multi_scores, + min_level, + max_level, + post_nms_top_n, + rois_num_per_level=None, + name=None): + """ + + **This OP only supports LoDTensor as input**. Concat multi-level RoIs + (Region of Interest) and select N RoIs with respect to multi_scores. + This operation performs the following steps: + + 1. Choose num_level RoIs and scores as input: num_level = max_level - min_level + 2. Concat multi-level RoIs and scores + 3. Sort scores and select post_nms_top_n scores + 4. Gather RoIs by selected indices from scores + 5. Re-sort RoIs by corresponding batch_id + + Args: + multi_rois(list): List of RoIs to collect. Element in list is 2-D + LoDTensor with shape [N, 4] and data type is float32 or float64, + N is the number of RoIs. + multi_scores(list): List of scores of RoIs to collect. Element in list + is 2-D LoDTensor with shape [N, 1] and data type is float32 or + float64, N is the number of RoIs. + min_level(int): The lowest level of FPN layer to collect + max_level(int): The highest level of FPN layer to collect + post_nms_top_n(int): The number of selected RoIs + rois_num_per_level(list, optional): The List of RoIs' numbers. + Each element is 1-D Tensor which contains the RoIs' number of each + image on each level and the shape is [B] and data type is + int32, B is the number of images. If it is not None then return + a 1-D Tensor contains the output RoIs' number of each image and + the shape is [B]. Default: None + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Variable: + + fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is + float32 or float64. Selected RoIs. + + rois_num(Tensor): 1-D Tensor contains the RoIs's number of each + image. The shape is [B] and data type is int32. B is the number of + images. + + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + multi_rois = [] + multi_scores = [] + for i in range(4): + multi_rois.append(paddle.static.data( + name='roi_'+str(i), shape=[None, 4], dtype='float32', lod_level=1)) + for i in range(4): + multi_scores.append(paddle.static.data( + name='score_'+str(i), shape=[None, 1], dtype='float32', lod_level=1)) + + fpn_rois = ops.collect_fpn_proposals( + multi_rois=multi_rois, + multi_scores=multi_scores, + min_level=2, + max_level=5, + post_nms_top_n=2000) + """ + check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals') + check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals') + num_lvl = max_level - min_level + 1 + input_rois = multi_rois[:num_lvl] + input_scores = multi_scores[:num_lvl] + + if in_dygraph_mode(): + assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode." + attrs = ('post_nms_topN', post_nms_top_n) + output_rois, rois_num = core.ops.collect_fpn_proposals( + input_rois, input_scores, rois_num_per_level, *attrs) + return output_rois, rois_num + + else: + helper = LayerHelper('collect_fpn_proposals', **locals()) + dtype = helper.input_dtype('multi_rois') + check_dtype(dtype, 'multi_rois', ['float32', 'float64'], + 'collect_fpn_proposals') + output_rois = helper.create_variable_for_type_inference(dtype) + output_rois.stop_gradient = True + + inputs = { + 'MultiLevelRois': input_rois, + 'MultiLevelScores': input_scores, + } + outputs = {'FpnRois': output_rois} + if rois_num_per_level is not None: + inputs['MultiLevelRoIsNum'] = rois_num_per_level + rois_num = helper.create_variable_for_type_inference(dtype='int32') + rois_num.stop_gradient = True + outputs['RoisNum'] = rois_num + helper.append_op( + type='collect_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={'post_nms_topN': post_nms_top_n}) + return output_rois, rois_num + + +@paddle.jit.not_to_static +def distribute_fpn_proposals(fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset=False, + rois_num=None, + name=None): + """ + + **This op only takes LoDTensor as input.** In Feature Pyramid Networks + (FPN) models, it is needed to distribute all proposals into different FPN + level, with respect to scale of the proposals, the referring scale and the + referring level. Besides, to restore the order of proposals, we return an + array which indicates the original index of rois in current proposals. + To compute FPN level for each roi, the formula is given as follows: + + .. math:: + + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} + + level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + + where BBoxArea is a function to compute the area of each roi. + + Args: + + fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is + float32 or float64. The input fpn_rois. + min_level(int32): The lowest level of FPN layer where the proposals come + from. + max_level(int32): The highest level of FPN layer where the proposals + come from. + refer_level(int32): The referring level of FPN layer with specified scale. + refer_scale(int32): The referring scale of FPN layer with specified level. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. 
+ name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tuple: + + multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] + and data type of float32 and float64. The length is + max_level-min_level+1. The proposals in each FPN level. + + restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is + the number of total rois. The data type is int32. It is + used to restore the order of fpn_rois. + + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32. B is the number of images + + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + fpn_rois = paddle.static.data( + name='data', shape=[None, 4], dtype='float32', lod_level=1) + multi_rois, restore_ind = ops.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + """ + num_lvl = max_level - min_level + 1 + + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale, 'pixel_offset', + pixel_offset) + multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level + + else: + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], + 'distribute_fpn_proposals') + helper = LayerHelper('distribute_fpn_proposals', **locals()) + dtype = helper.input_dtype('fpn_rois') + multi_rois = [ + helper.create_variable_for_type_inference(dtype) + for i in range(num_lvl) + ] + + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + + helper.append_op( + type='distribute_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale, + 'pixel_offset': pixel_offset + }) + return multi_rois, restore_ind, rois_num_per_level + + +@paddle.jit.not_to_static +def yolo_box( + x, + origin_shape, + anchors, + class_num, + conf_thresh, + downsample_ratio, + clip_bbox=True, + scale_x_y=1., + name=None, ): + """ + + This operator generates YOLO detection boxes from output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. 
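+    For example, with 3 anchors per grid point and 80 classes (as in COCO),
+    C = 3 * (5 + 80) = 255, which is the channel number used in the example
+    below.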
+ Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represents the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of + confidence scores and classification scores. + $$ + score_{pred} = score_{conf} * score_{class} + $$ + + Args: + x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with shape of [N, C, H, W]. + The second dimension(C) stores box locations, confidence score and + classification one-hot keys of each anchor box. Generally, X should be the output of YOLOv3 network. + The data type is float32 or float64. + origin_shape (Tensor): The image size tensor of YoloBox operator, This is a 2-D tensor with shape of [N, 2]. + This tensor holds height and width of each input image used for resizing output box in input image + scale. The data type is int32. + anchors (list|tuple): The anchor width and height, it will be parsed pair by pair. + class_num (int): The number of classes to predict. + conf_thresh (float): The confidence scores threshold of detection boxes. Boxes with confidence scores + under threshold should be ignored. + downsample_ratio (int): The downsample ratio from network input to YoloBox operator input, + so 32, 16, 8 should be set for the first, second, and thrid YoloBox operators. + clip_bbox (bool): Whether clip output bonding box in Input(ImgSize) boundary. Default true. + scale_x_y (float): Scale the center point of decoded bounding box. Default 1.0. + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, + please refer to :ref:`api_guide_Name` + + Returns: + boxes Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, N is the batch num, + M is output box number, and the 3rd dimension stores [xmin, ymin, xmax, ymax] coordinates of boxes. + scores Tensor: A 3-D tensor with shape [N, M, :attr:`class_num`], the coordinates of boxes, N is the batch num, + M is output box number. + + Raises: + TypeError: Attr anchors of yolo box must be list or tuple + TypeError: Attr class_num of yolo box must be an integer + TypeError: Attr conf_thresh of yolo box must be a float number + + Examples: + + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + + paddle.enable_static() + x = paddle.static.data(name='x', shape=[None, 255, 13, 13], dtype='float32') + img_size = paddle.static.data(name='img_size',shape=[None, 2],dtype='int64') + anchors = [10, 13, 16, 30, 33, 23] + boxes,scores = ops.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, + conf_thresh=0.01, downsample_ratio=32) + """ + helper = LayerHelper('yolo_box', **locals()) + + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolo_box must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolo_box must be an integer") + if not isinstance(conf_thresh, float): + raise TypeError("Attr ignore_thresh of yolo_box must be a float number") + + if in_dygraph_mode(): + attrs = ('anchors', anchors, 'class_num', class_num, 'conf_thresh', + conf_thresh, 'downsample_ratio', downsample_ratio, 'clip_bbox', + clip_bbox, 'scale_x_y', scale_x_y) + boxes, scores = core.ops.yolo_box(x, origin_shape, *attrs) + return boxes, scores + else: + boxes = helper.create_variable_for_type_inference(dtype=x.dtype) + scores = helper.create_variable_for_type_inference(dtype=x.dtype) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "conf_thresh": conf_thresh, + "downsample_ratio": downsample_ratio, + "clip_bbox": clip_bbox, + "scale_x_y": scale_x_y, + } + + helper.append_op( + type='yolo_box', + inputs={ + "X": x, + "ImgSize": origin_shape, + }, + outputs={ + 'Boxes': boxes, + 'Scores': scores, + }, + attrs=attrs) + return boxes, scores + + +@paddle.jit.not_to_static +def prior_box(input, + image, + min_sizes, + max_sizes=None, + aspect_ratios=[1.], + variance=[0.1, 0.1, 0.2, 0.2], + flip=False, + clip=False, + steps=[0.0, 0.0], + offset=0.5, + min_max_aspect_ratios_order=False, + name=None): + """ + + This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. + Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + + Parameters: + input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. + image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, + the data type should be float32 or float64. + min_sizes(list|tuple|float): the min sizes of generated prior boxes. + max_sizes(list|tuple|None): the max sizes of generated prior boxes. + Default: None. + aspect_ratios(list|tuple|float): the aspect ratios of generated + prior boxes. Default: [1.]. + variance(list|tuple): the variances to be encoded in prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + flip(bool): Whether to flip aspect ratios. Default:False. + clip(bool): Whether to clip out-of-boundary boxes. Default: False. + step(list|tuple): Prior boxes step across width and height, If + step[0] equals to 0.0 or step[1] equals to 0.0, the prior boxes step across + height or weight of the input will be automatically calculated. + Default: [0., 0.] + offset(float): Prior boxes center offset. Default: 0.5 + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. 
+ name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tuple: A tuple with two Variable (boxes, variances) + + boxes(Tensor): the output prior boxes of PriorBox. + 4-D tensor, the layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total box count of each position of input. + + variances(Tensor): the expanded variances of PriorBox. + 4-D tensor, the layput is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total box count of each position of input + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + + paddle.enable_static() + input = paddle.static.data(name="input", shape=[None,3,6,9]) + image = paddle.static.data(name="image", shape=[None,3,9,12]) + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=[100.], + clip=True, + flip=True) + """ + helper = LayerHelper("prior_box", **locals()) + dtype = helper.input_dtype() + check_variable_and_dtype( + input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box') + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(min_sizes): + min_sizes = [min_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not (_is_list_or_tuple_(steps) and len(steps) == 2): + raise ValueError('steps should be a list or tuple ', + 'with length 2, (step_width, step_height).') + + min_sizes = list(map(float, min_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + steps = list(map(float, steps)) + + cur_max_sizes = None + if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: + if not _is_list_or_tuple_(max_sizes): + max_sizes = [max_sizes] + cur_max_sizes = max_sizes + + if in_dygraph_mode(): + attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios, + 'variances', variance, 'flip', flip, 'clip', clip, 'step_w', + steps[0], 'step_h', steps[1], 'offset', offset, + 'min_max_aspect_ratios_order', min_max_aspect_ratios_order) + if cur_max_sizes is not None: + attrs += ('max_sizes', cur_max_sizes) + box, var = core.ops.prior_box(input, image, *attrs) + return box, var + else: + attrs = { + 'min_sizes': min_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'flip': flip, + 'clip': clip, + 'step_w': steps[0], + 'step_h': steps[1], + 'offset': offset, + 'min_max_aspect_ratios_order': min_max_aspect_ratios_order + } + + if cur_max_sizes is not None: + attrs['max_sizes'] = cur_max_sizes + + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + +@paddle.jit.not_to_static +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=-1, + return_index=False, + return_rois_num=True, + rois_num=None, + name=None): + """ + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. 
+ In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): Two types of bboxes are supported: + 1. (Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Tensor): Two types of scores are supported: + 1. (Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. + name(str): Name of the multiclass nms op. Default: None. + Returns: + A tuple with two Variables: (Out, Index) if return_index is True, + otherwise, a tuple with one Variable(Out) is returned. + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, + x4, y4]. No is the total number of detections. + If all images have not detected results, all elements in LoD will be + 0, and output tensor is empty (None). + Index: Only return when return_index is True. A 2-D LoDTensor with + shape [No, 1] represents the selected index which type is Integer. 
+ The index is the absolute value cross batches. No is the same number + as Out. If the index is used to gather other attribute such as age, + one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where + N is the batch size and M is the number of boxes. + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out, index = ops.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True) + """ + helper = LayerHelper('multiclass_nms3', **locals()) + + if in_dygraph_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', + nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, + 'normalized', normalized) + output, index, nms_rois_num = core.ops.multiclass_nms3(bboxes, scores, + rois_num, *attrs) + if return_index: + index = None + return output, nms_rois_num, index + + else: + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int') + + inputs = {'BBoxes': bboxes, 'Scores': scores} + outputs = {'Out': output, 'Index': index} + + if rois_num is not None: + inputs['RoisNum'] = rois_num + + if return_rois_num: + nms_rois_num = helper.create_variable_for_type_inference( + dtype='int32') + outputs['NmsRoisNum'] = nms_rois_num + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + index.stop_gradient = True + if not return_index: + index = None + if not return_rois_num: + nms_rois_num = None + + return output, nms_rois_num, index + + +@paddle.jit.not_to_static +def matrix_nms(bboxes, + scores, + score_threshold, + post_threshold, + nms_top_k, + keep_top_k, + use_gaussian=False, + gaussian_sigma=2., + background_label=0, + normalized=True, + return_index=False, + return_rois_num=True, + name=None): + """ + **Matrix NMS** + This operator does matrix non maximum suppression (NMS). + First selects a subset of candidate bounding boxes that have higher scores + than score_threshold (if provided), then the top k candidate is selected if + nms_top_k is larger than -1. Score of the remaining candidate are then + decayed according to the Matrix NMS scheme. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + The data type is float32 or float64. + scores (Tensor): A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. 
The data type is float32 or float64. + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. + post_threshold (float): Threshold to filter out bounding boxes with + low confidence score AFTER decaying. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + use_gaussian (bool): Use Gaussian as the decay function. Default: False + gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + return_rois_num(bool): whether return rois_num. Default: True + name(str): Name of the matrix nms op. Default: None. + Returns: + A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + Out (Tensor): A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + Index (Tensor): A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. + rois_num (Tensor): A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: + .. code-block:: python + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[None,81], + dtype='float32', lod_level=1) + out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, + score_threshold=0.5, post_threshold=0.1, + nms_top_k=400, keep_top_k=200, normalized=False) + """ + check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], + 'matrix_nms') + check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], + 'matrix_nms') + check_type(score_threshold, 'score_threshold', float, 'matrix_nms') + check_type(post_threshold, 'post_threshold', float, 'matrix_nms') + check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms') + check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') + check_type(normalized, 'normalized', bool, 'matrix_nms') + check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') + check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') + check_type(background_label, 'background_label', int, 'matrix_nms') + + if in_dygraph_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'post_threshold', post_threshold, 'nms_top_k', + nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', + use_gaussian, 'keep_top_k', keep_top_k, 'normalized', + normalized) + out, index, rois_num = core.ops.matrix_nms(bboxes, scores, *attrs) + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return out, rois_num, index + else: + helper = LayerHelper('matrix_nms', **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int') + outputs = {'Out': output, 'Index': index} + if 
return_rois_num: + rois_num = helper.create_variable_for_type_inference(dtype='int') + outputs['RoisNum'] = rois_num + + helper.append_op( + type="matrix_nms", + inputs={'BBoxes': bboxes, + 'Scores': scores}, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'post_threshold': post_threshold, + 'nms_top_k': nms_top_k, + 'gaussian_sigma': gaussian_sigma, + 'use_gaussian': use_gaussian, + 'keep_top_k': keep_top_k, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return output, rois_num, index + + +def bipartite_match(dist_matrix, + match_type=None, + dist_threshold=None, + name=None): + """ + + This operator implements a greedy bipartite matching algorithm, which is + used to obtain the matching with the maximum distance based on the input + distance matrix. For input 2D matrix, the bipartite matching algorithm can + find the matched column for each row (matched means the largest distance), + also can find the matched row for each column. And this operator only + calculate matched indices from column to row. For each instance, + the number of matched indices is the column number of the input distance + matrix. **The OP only supports CPU**. + + There are two outputs, matched indices and distance. + A simple description, this algorithm matched the best (maximum distance) + row entity to the column entity and the matched indices are not duplicated + in each row of ColToRowMatchIndices. If the column entity is not matched + any row entity, set -1 in ColToRowMatchIndices. + + NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. + If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. + If Tensor, the height of ColToRowMatchIndices is 1. + + NOTE: This API is a very low level API. It is used by :code:`ssd_loss` + layer. Please consider to use :code:`ssd_loss` instead. + + Args: + dist_matrix(Tensor): This input is a 2-D LoDTensor with shape + [K, M]. The data type is float32 or float64. It is pair-wise + distance matrix between the entities represented by each row and + each column. For example, assumed one entity is A with shape [K], + another entity is B with shape [M]. The dist_matrix[i][j] is the + distance between A[i] and B[j]. The bigger the distance is, the + better matching the pairs are. NOTE: This tensor can contain LoD + information to represent a batch of inputs. One instance of this + batch can contain different numbers of entities. + match_type(str, optional): The type of matching method, should be + 'bipartite' or 'per_prediction'. None ('bipartite') by default. + dist_threshold(float32, optional): If `match_type` is 'per_prediction', + this threshold is to determine the extra matching bboxes based + on the maximum distance, 0.5 by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tuple: + + matched_indices(Tensor): A 2-D Tensor with shape [N, M]. The data + type is int32. N is the batch size. If match_indices[i][j] is -1, it + means B[j] does not match any entity in i-th instance. + Otherwise, it means B[j] is matched to row + match_indices[i][j] in i-th instance. The row number of + i-th instance is saved in match_indices[i][j]. + + matched_distance(Tensor): A 2-D Tensor with shape [N, M]. The data + type is float32. N is batch size. 
If match_indices[i][j] is -1, + match_distance[i][j] is also -1.0. Otherwise, assuming the matched row + index is d and the row offsets of each instance are the LoD, then + match_distance[i][j] = dist_matrix[d + LoD[i]][j]. + + Examples: + + .. code-block:: python + import paddle + from ppdet.modeling import ops + from ppdet.modeling.utils import iou_similarity + + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 4], dtype='float32') + y = paddle.static.data(name='y', shape=[None, 4], dtype='float32') + iou = iou_similarity(x=x, y=y) + matched_indices, matched_dist = ops.bipartite_match(iou) + """ + check_variable_and_dtype(dist_matrix, 'dist_matrix', + ['float32', 'float64'], 'bipartite_match') + + if in_dygraph_mode(): + match_indices, match_distance = core.ops.bipartite_match( + dist_matrix, "match_type", match_type, "dist_threshold", + dist_threshold) + return match_indices, match_distance + + helper = LayerHelper('bipartite_match', **locals()) + match_indices = helper.create_variable_for_type_inference(dtype='int32') + match_distance = helper.create_variable_for_type_inference( + dtype=dist_matrix.dtype) + helper.append_op( + type='bipartite_match', + inputs={'DistMat': dist_matrix}, + attrs={ + 'match_type': match_type, + 'dist_threshold': dist_threshold, + }, + outputs={ + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_distance + }) + return match_indices, match_distance + + +@paddle.jit.not_to_static +def box_coder(prior_box, + prior_box_var, + target_box, + code_type="encode_center_size", + box_normalized=True, + axis=0, + name=None): + """ + **Box Coder Layer** + Encode/Decode the target bounding box with the priorbox information. + + The encoding schema is described below: + .. math:: + ox = (tx - px) / pw / pxv + oy = (ty - py) / ph / pyv + ow = \log(\abs(tw / pw)) / pwv + oh = \log(\abs(th / ph)) / phv + The decoding schema is described below: + + .. math:: + + ox = pw * pxv * tx + px + oy = ph * pyv * ty + py + ow = \exp(pwv * tw) * pw + oh = \exp(phv * th) * ph + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively (for decoding, they are the encoded deltas). + Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center + coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the + variance of the priorbox, and `ox`, `oy`, `ow`, `oh` denote the + encoded/decoded center coordinates, width and height; decoded boxes are + returned in [xmin, ymin, xmax, ymax] form. + During box decoding, two broadcast modes are supported. Say the target + box has shape [N, M, 4], and the shape of the prior box can be [N, 4] or + [M, 4]. Then the prior box will broadcast to the target box along the + assigned axis. + + Args: + prior_box(Tensor): prior_box is a 2-D Tensor with shape + [M, 4] that holds M boxes; the data type is float32 or float64. Each box + is represented as [xmin, ymin, xmax, ymax]; [xmin, ymin] is the + left-top coordinate of the anchor box (if the input is an image feature + map, it is close to the origin of the coordinate system) and + [xmax, ymax] is the right-bottom coordinate of the anchor box. + prior_box_var(List|Tensor|None): prior_box_var supports three types + of input. One is a Tensor with shape [M, 4] which holds M groups; the + data type is float32 or float64. The second is a list of + 4 elements shared by all boxes; the data type is float32 or float64. + The third is None, in which case the variance is not involved in the calculation. + target_box(Tensor): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. 
This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + code_type(str): The code type used with the target box. It can be + `encode_center_size` or `decode_center_size`. `encode_center_size` + by default. + box_normalized(bool): Whether treat the priorbox as a normalized box. + Set true by default. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and + PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: + output_box(Tensor): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size + and M represents the number of decoded boxes. + + Examples: + + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + # For encode + prior_box_encode = paddle.static.data(name='prior_box_encode', + shape=[512, 4], + dtype='float32') + target_box_encode = paddle.static.data(name='target_box_encode', + shape=[81, 4], + dtype='float32') + output_encode = ops.box_coder(prior_box=prior_box_encode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_encode, + code_type="encode_center_size") + # For decode + prior_box_decode = paddle.static.data(name='prior_box_decode', + shape=[512, 4], + dtype='float32') + target_box_decode = paddle.static.data(name='target_box_decode', + shape=[512, 81, 4], + dtype='float32') + output_decode = ops.box_coder(prior_box=prior_box_decode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_decode, + code_type="decode_center_size", + box_normalized=False, + axis=1) + """ + check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], + 'box_coder') + check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], + 'box_coder') + + if in_dygraph_mode(): + if isinstance(prior_box_var, Variable): + output_box = core.ops.box_coder( + prior_box, prior_box_var, target_box, "code_type", code_type, + "box_normalized", box_normalized, "axis", axis) + + elif isinstance(prior_box_var, list): + output_box = core.ops.box_coder( + prior_box, None, target_box, "code_type", code_type, + "box_normalized", box_normalized, "axis", axis, "variance", + prior_box_var) + else: + raise TypeError( + "Input variance of box_coder must be Variable or list") + return output_box + else: + helper = LayerHelper("box_coder", **locals()) + + output_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype) + + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = { + "code_type": code_type, + "box_normalized": box_normalized, + "axis": axis + } + if isinstance(prior_box_var, Variable): + inputs['PriorBoxVar'] = prior_box_var + elif isinstance(prior_box_var, list): + attrs['variance'] = prior_box_var + else: + raise TypeError( + "Input variance of box_coder must be Variable or list") + helper.append_op( + type="box_coder", + inputs=inputs, + attrs=attrs, + outputs={"OutputBox": output_box}) + return output_box + + 
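For reference, a minimal dygraph sketch of the decode path above (an editor's illustration, not part of the commit; the shapes and variance values are only illustrative and mirror the static-graph example in the docstring):

```python
import paddle
from ppdet.modeling import ops

# 512 prior boxes; for each prior, 81 sets of deltas to decode (illustrative shapes).
prior_box = paddle.rand([512, 4])
target_box = paddle.rand([512, 81, 4])

decoded = ops.box_coder(
    prior_box=prior_box,
    prior_box_var=[0.1, 0.1, 0.2, 0.2],  # per-coordinate variances shared by all boxes
    target_box=target_box,
    code_type="decode_center_size",
    box_normalized=False,
    axis=1)
print(decoded.shape)  # [512, 81, 4]
```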
+@paddle.jit.not_to_static +def generate_proposals(scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + pixel_offset=False, + return_rois_num=False, + name=None): + """ + **Generate proposals for Faster R-CNN** + This operation proposes RoIs according to each box's probability of + being a foreground object; the boxes are computed from the anchors. + The bbox deltas and objectness scores are outputs of the RPN. The final + proposals can be used to train the detection network. + To generate proposals, this operation performs the following steps: + 1. Transpose and reshape scores and bbox_deltas to + (H*W*A, 1) and (H*W*A, 4). + 2. Calculate box locations as proposal candidates. + 3. Clip boxes to the image. + 4. Remove predicted boxes with small area. + 5. Apply NMS to get the final proposals as output. + Args: + scores(Tensor): A 4-D Tensor with shape [N, A, H, W] representing + the probability for each box to be an object. + N is the batch size, A is the number of anchors, H and W are the height and + width of the feature map. The data type must be float32. + bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] + representing the difference between the predicted box location and + the anchor location. The data type must be float32. + im_shape(Tensor): A 2-D Tensor with shape [N, 2] representing H, W, the + origin image size or input size. The data type can be float32 or + float64. + anchors(Tensor): A 4-D Tensor representing the anchors with a layout + of [H, W, A, 4]. H and W are the height and width of the feature map, + A is the number of anchors at each position. Each anchor is + in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32. + variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of + [H, W, num_priors, 4]. Each variance is in + (xcenter, ycenter, w, h) format. The data type must be float32. + pre_nms_top_n(int): Number of total bboxes to be kept per + image before NMS. `6000` by default. + post_nms_top_n(int): Number of total bboxes to be kept per + image after NMS. `1000` by default. + nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. + min_size(float): Remove predicted boxes with either height or + width < min_size. The data type must be float32. `0.1` by default. + eta(float): Applied in adaptive NMS: if the adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + return_rois_num(bool): When True, a 1-D Tensor with shape [N, ] is also returned, giving the + number of RoIs of each image in the batch (N is the number of images). For example, the values [4, 5] mean + the first image has 4 RoIs and the second image has 5 RoIs. It is only used in R-CNN models. + `False` by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name does not need to be set and is + None by default. + + Returns: + tuple: + A tuple with format ``(rpn_rois, rpn_roi_probs)``, plus ``rpn_rois_num`` when return_rois_num is True. + - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. + - **rpn_roi_probs**: The scores of the generated RoIs. 2-D Tensor with shape ``[N, 1]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. + + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') + bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') + im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') + anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') + variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') + rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, + im_shape, anchors, variances) + """ + if in_dygraph_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode." + attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, + 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, + 'pixel_offset', pixel_offset) + rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals_v2( + scores, bbox_deltas, im_shape, anchors, variances, *attrs) + return rpn_rois, rpn_roi_probs, rpn_rois_num + + else: + helper = LayerHelper('generate_proposals_v2', **locals()) + + check_variable_and_dtype(scores, 'scores', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], + 'generate_proposals_v2') + check_variable_and_dtype(anchors, 'anchors', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(variances, 'variances', ['float32'], + 'generate_proposals_v2') + + rpn_rois = helper.create_variable_for_type_inference( + dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_variable_for_type_inference( + dtype=scores.dtype) + outputs = { + 'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs, + } + if return_rois_num: + rpn_rois_num = helper.create_variable_for_type_inference( + dtype='int32') + rpn_rois_num.stop_gradient = True + outputs['RpnRoisNum'] = rpn_rois_num + + helper.append_op( + type="generate_proposals_v2", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImShape': im_shape, + 'Anchors': anchors, + 'Variances': variances + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta, + 'pixel_offset': pixel_offset + }, + outputs=outputs) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + + return rpn_rois, rpn_roi_probs, rpn_rois_num + + +def sigmoid_cross_entropy_with_logits(input, + label, + ignore_index=-100, + normalize=False): + output = F.binary_cross_entropy_with_logits(input, label, reduction='none') + mask_tensor = paddle.cast(label != ignore_index, 'float32') + output = paddle.multiply(output, mask_tensor) + if normalize: + sum_valid_mask = paddle.sum(mask_tensor) + output = output / sum_valid_mask + return output + + +def smooth_l1(input, label, inside_weight=None, outside_weight=None, + sigma=None): + input_new = paddle.multiply(input, inside_weight) + label_new = paddle.multiply(label, inside_weight) + delta = 1 / (sigma * sigma) + out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) + out = paddle.multiply(out, outside_weight) + out = out / delta + out = paddle.reshape(out, shape=[out.shape[0], -1]) + out = paddle.sum(out, axis=1) + return out diff --git a/ppdet/modeling/post_process.py b/ppdet/modeling/post_process.py new file mode 100644 index 0000000..ca69ac3 --- /dev/null +++ 
b/ppdet/modeling/post_process.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ppdet.modeling.bbox_utils import nonempty_bbox, rbox2poly +from . import ops +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence + +__all__ = ['BBoxPostProcess', 'MaskPostProcess', 'FCOSPostProcess'] + + +@register +class BBoxPostProcess(object): + __shared__ = ['num_classes'] + __inject__ = ['decode', 'nms'] + + def __init__(self, num_classes=80, decode=None, nms=None): + super(BBoxPostProcess, self).__init__() + self.num_classes = num_classes + self.decode = decode + self.nms = nms + + def __call__(self, head_out, rois, im_shape, scale_factor): + """ + Decode the bbox and do NMS if needed. + + Args: + head_out (tuple): bbox_pred and cls_prob of bbox_head output. + rois (tuple): roi and rois_num of rpn_head output. + im_shape (Tensor): The shape of the input image. + scale_factor (Tensor): The scale factor of the input image. + Returns: + bbox_pred (Tensor): The output prediction with shape [N, 6], including + labels, scores and bboxes. The size of bboxes are corresponding + to the input image, the bboxes may be used in other branch. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [1], and is N. + """ + if self.nms is not None: + bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) + bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes) + else: + bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, + scale_factor) + + # Prevent empty bbox_pred from decode or NMS. + # Bboxes and score before NMS may be empty due to the score threshold. + if bbox_pred.shape[0] == 0: + bbox_pred = paddle.to_tensor( + np.array( + [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) + bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) + return bbox_pred, bbox_num + + def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): + """ + Rescale, clip and filter the bbox from the output of NMS to + get final prediction. + + Notes: + Currently only support bs = 1. + + Args: + bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode + and NMS, including labels, scores and bboxes. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [1], and is N. + im_shape (Tensor): The shape of the input image. + scale_factor (Tensor): The scale factor of the input image. + Returns: + pred_result (Tensor): The final prediction results with shape [N, 6] + including labels, scores and bboxes. 
+ """ + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + + origin_shape_list = [] + scale_factor_list = [] + # scale_factor: scale_y, scale_x + for i in range(bbox_num.shape[0]): + expand_shape = paddle.expand(origin_shape[i:i + 1, :], + [bbox_num[i], 2]) + scale_y, scale_x = scale_factor[i][0], scale_factor[i][1] + scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) + expand_scale = paddle.expand(scale, [bbox_num[i], 4]) + origin_shape_list.append(expand_shape) + scale_factor_list.append(expand_scale) + + self.origin_shape_list = paddle.concat(origin_shape_list) + scale_factor_list = paddle.concat(scale_factor_list) + + # bboxes: [N, 6], label, score, bbox + pred_label = bboxes[:, 0:1] + pred_score = bboxes[:, 1:2] + pred_bbox = bboxes[:, 2:] + # rescale bbox to original image + scaled_bbox = pred_bbox / scale_factor_list + origin_h = self.origin_shape_list[:, 0] + origin_w = self.origin_shape_list[:, 1] + zeros = paddle.zeros_like(origin_h) + # clip bbox to [0, original_size] + x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) + y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) + x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) + y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) + pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) + # filter empty bbox + keep_mask = nonempty_bbox(pred_bbox, return_mask=True) + keep_mask = paddle.unsqueeze(keep_mask, [1]) + pred_label = paddle.where(keep_mask, pred_label, + paddle.ones_like(pred_label) * -1) + pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) + return pred_result + + def get_origin_shape(self, ): + return self.origin_shape_list + + +@register +class MaskPostProcess(object): + def __init__(self, binary_thresh=0.5): + super(MaskPostProcess, self).__init__() + self.binary_thresh = binary_thresh + + def paste_mask(self, masks, boxes, im_h, im_w): + """ + Paste the mask prediction to the original image. + """ + x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) + masks = paddle.unsqueeze(masks, [0, 1]) + img_y = paddle.arange(0, im_h, dtype='float32') + 0.5 + img_x = paddle.arange(0, im_w, dtype='float32') + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + img_x = paddle.unsqueeze(img_x, [1]) + img_y = paddle.unsqueeze(img_y, [2]) + N = boxes.shape[0] + + gx = paddle.expand(img_x, [N, img_y.shape[1], img_x.shape[2]]) + gy = paddle.expand(img_y, [N, img_y.shape[1], img_x.shape[2]]) + # TODO: Because paddle.expand transform error when dygraph + # to static, use reshape to avoid mistakes. + gx = paddle.reshape(gx, [N, img_y.shape[1], img_x.shape[2]]) + gy = paddle.reshape(gy, [N, img_y.shape[1], img_x.shape[2]]) + grid = paddle.stack([gx, gy], axis=3) + img_masks = F.grid_sample(masks, grid, align_corners=False) + return img_masks[:, 0] + + def __call__(self, mask_out, bboxes, bbox_num, origin_shape): + """ + Decode the mask_out and paste the mask to the origin image. + + Args: + mask_out (Tensor): mask_head output with shape [N, 28, 28]. + bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode + and NMS, including labels, scores and bboxes. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [1], and is N. + origin_shape (Tensor): The origin shape of the input image, the tensor + shape is [N, 2], and each row is [h, w]. + Returns: + pred_result (Tensor): The final prediction mask results with shape + [N, h, w] in binary mask style. 
+ """ + num_mask = mask_out.shape[0] + origin_shape = paddle.cast(origin_shape, 'int32') + # TODO: support bs > 1 and mask output dtype is bool + pred_result = paddle.zeros( + [num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='int32') + if bbox_num == 1 and bboxes[0][0] == -1: + return pred_result + + # TODO: optimize chunk paste + pred_result = [] + for i in range(bboxes.shape[0]): + im_h, im_w = origin_shape[i][0], origin_shape[i][1] + pred_mask = self.paste_mask(mask_out[i], bboxes[i:i + 1, 2:], im_h, + im_w) + pred_mask = pred_mask >= self.binary_thresh + pred_mask = paddle.cast(pred_mask, 'int32') + pred_result.append(pred_mask) + pred_result = paddle.concat(pred_result) + return pred_result + + +@register +class FCOSPostProcess(object): + __inject__ = ['decode', 'nms'] + + def __init__(self, decode=None, nms=None): + super(FCOSPostProcess, self).__init__() + self.decode = decode + self.nms = nms + + def __call__(self, fcos_head_outs, scale_factor): + locations, cls_logits, bboxes_reg, centerness = fcos_head_outs + bboxes, score = self.decode(locations, cls_logits, bboxes_reg, + centerness, scale_factor) + bbox_pred, bbox_num, _ = self.nms(bboxes, score) + return bbox_pred, bbox_num + + +@register +class S2ANetBBoxPostProcess(object): + __inject__ = ['nms'] + + def __init__(self, nms_pre=2000, min_bbox_size=0, nms=None): + super(S2ANetBBoxPostProcess, self).__init__() + self.nms_pre = nms_pre + self.min_bbox_size = min_bbox_size + self.nms = nms + self.origin_shape_list = [] + + def get_prediction(self, pred_scores, pred_bboxes, im_shape, scale_factor): + """ + pred_scores : [N, M] score + pred_bboxes : [N, 5] xc, yc, w, h, a + im_shape : [N, 2] im_shape + scale_factor : [N, 2] scale_factor + """ + # TODO: support bs>1 + pred_ploys = rbox2poly(pred_bboxes.numpy()) + pred_ploys = paddle.to_tensor(pred_ploys) + pred_ploys = paddle.reshape( + pred_ploys, [1, pred_ploys.shape[0], pred_ploys.shape[1]]) + + pred_scores = paddle.to_tensor(pred_scores) + # pred_scores [NA, 16] --> [16, NA] + pred_scores = paddle.transpose(pred_scores, [1, 0]) + pred_scores = paddle.reshape( + pred_scores, [1, pred_scores.shape[0], pred_scores.shape[1]]) + pred_cls_score_bbox, bbox_num, index = self.nms(pred_ploys, pred_scores) + + # post process scale + # result [n, 10] + if bbox_num > 0: + pred_bbox, bbox_num = self.post_process(pred_cls_score_bbox[:, 2:], + bbox_num, im_shape[0], + scale_factor[0]) + + pred_cls_score_bbox = paddle.concat( + [pred_cls_score_bbox[:, 0:2], pred_bbox], axis=1) + else: + pred_cls_score_bbox = paddle.to_tensor( + np.array( + [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + dtype='float32')) + bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) + return pred_cls_score_bbox, bbox_num, index + + def post_process(self, bboxes, bbox_num, im_shape, scale_factor): + """ + Rescale, clip and filter the bbox from the output of NMS to + get final prediction. + + Args: + bboxes(Tensor): bboxes [N, 8] + bbox_num(Tensor): bbox_num + im_shape(Tensor): [1 2] + scale_factor(Tensor): [1 2] + Returns: + bbox_pred(Tensor): The output is the prediction with shape [N, 8] + including labels, scores and bboxes. The size of + bboxes are corresponding to the original image. 
+ """ + + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + + origin_h = origin_shape[0] + origin_w = origin_shape[1] + + bboxes[:, 0::2] = bboxes[:, 0::2] / scale_factor[0] + bboxes[:, 1::2] = bboxes[:, 1::2] / scale_factor[1] + + zeros = paddle.zeros_like(origin_h) + x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) + y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) + x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) + y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) + x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) + y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) + x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) + y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) + bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) + bboxes = (bbox, bbox_num) + return bboxes diff --git a/ppdet/modeling/proposal_generator/__init__.py b/ppdet/modeling/proposal_generator/__init__.py new file mode 100644 index 0000000..9fb518f --- /dev/null +++ b/ppdet/modeling/proposal_generator/__init__.py @@ -0,0 +1,2 @@ +from . import rpn_head +from .rpn_head import * diff --git a/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..938cdf3 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c441ac9 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-38.pyc new file mode 100644 index 0000000..aeec124 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-38.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-39.pyc new file mode 100644 index 0000000..ac4c26f Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/anchor_generator.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-38.pyc new file mode 100644 index 0000000..3fde371 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-38.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-39.pyc new file mode 100644 index 0000000..6745393 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/proposal_generator.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-38.pyc new file mode 100644 index 0000000..3d9e5de Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-38.pyc differ diff --git 
a/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-39.pyc new file mode 100644 index 0000000..dbd9765 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/rpn_head.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/target.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/target.cpython-38.pyc new file mode 100644 index 0000000..26950ee Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/target.cpython-38.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/target.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/target.cpython-39.pyc new file mode 100644 index 0000000..820617b Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/target.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-38.pyc b/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-38.pyc new file mode 100644 index 0000000..cdb1818 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-38.pyc differ diff --git a/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-39.pyc b/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-39.pyc new file mode 100644 index 0000000..6b79736 Binary files /dev/null and b/ppdet/modeling/proposal_generator/__pycache__/target_layer.cpython-39.pyc differ diff --git a/ppdet/modeling/proposal_generator/anchor_generator.py b/ppdet/modeling/proposal_generator/anchor_generator.py new file mode 100644 index 0000000..8088ffa --- /dev/null +++ b/ppdet/modeling/proposal_generator/anchor_generator.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from .. import ops + + +@register +class AnchorGenerator(nn.Layer): + """ + Generate anchors according to the feature maps + + Args: + anchor_sizes (list[float] | list[list[float]]): The anchor sizes at + each feature point. list[float] means all feature levels share the + same sizes. list[list[float]] means the anchor sizes for + each level. The sizes stand for the scale of input size. + aspect_ratios (list[float] | list[list[float]]): The aspect ratios at + each feature point. list[float] means all feature levels share the + same ratios. list[list[float]] means the aspect ratios for + each level. + strides (list[float]): The strides of feature maps which generate + anchors + offset (float): The offset of the coordinate of anchors, default 0. 
+ + """ + + def __init__(self, + anchor_sizes=[32, 64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + strides=[16.0], + variance=[1.0, 1.0, 1.0, 1.0], + offset=0.): + super(AnchorGenerator, self).__init__() + self.anchor_sizes = anchor_sizes + self.aspect_ratios = aspect_ratios + self.strides = strides + self.variance = variance + self.cell_anchors = self._calculate_anchors(len(strides)) + self.offset = offset + + def _broadcast_params(self, params, num_features): + if not isinstance(params[0], (list, tuple)): # list[float] + return [params] * num_features + if len(params) == 1: + return list(params) * num_features + return params + + def generate_cell_anchors(self, sizes, aspect_ratios): + anchors = [] + for size in sizes: + area = size**2.0 + for aspect_ratio in aspect_ratios: + w = math.sqrt(area / aspect_ratio) + h = aspect_ratio * w + x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 + anchors.append([x0, y0, x1, y1]) + return paddle.to_tensor(anchors, dtype='float32') + + def _calculate_anchors(self, num_features): + sizes = self._broadcast_params(self.anchor_sizes, num_features) + aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) + cell_anchors = [ + self.generate_cell_anchors(s, a) + for s, a in zip(sizes, aspect_ratios) + ] + [ + self.register_buffer( + t.name, t, persistable=False) for t in cell_anchors + ] + return cell_anchors + + def _create_grid_offsets(self, size, stride, offset): + grid_height, grid_width = size[0], size[1] + shifts_x = paddle.arange( + offset * stride, grid_width * stride, step=stride, dtype='float32') + shifts_y = paddle.arange( + offset * stride, grid_height * stride, step=stride, dtype='float32') + shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) + shift_x = paddle.reshape(shift_x, [-1]) + shift_y = paddle.reshape(shift_y, [-1]) + return shift_x, shift_y + + def _grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, + self.cell_anchors): + shift_x, shift_y = self._create_grid_offsets(size, stride, + self.offset) + shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) + shifts = paddle.reshape(shifts, [-1, 1, 4]) + base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) + + anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) + + return anchors + + def forward(self, input): + grid_sizes = [paddle.shape(feature_map)[-2:] for feature_map in input] + anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) + return anchors_over_all_feature_maps + + @property + def num_anchors(self): + """ + Returns: + int: number of anchors at every pixel + location, on that feature map. + For example, if at every pixel we use anchors of 3 aspect + ratios and 5 sizes, the number of anchors is 15. + For FPN models, `num_anchors` on every feature map is the same. + """ + return len(self.cell_anchors[0]) diff --git a/ppdet/modeling/proposal_generator/proposal_generator.py b/ppdet/modeling/proposal_generator/proposal_generator.py new file mode 100644 index 0000000..12518e4 --- /dev/null +++ b/ppdet/modeling/proposal_generator/proposal_generator.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register, serializable +from .. import ops + + +@register +@serializable +class ProposalGenerator(object): + """ + Proposal generation module + + For more details, please refer to the document of generate_proposals + in ppdet/modeing/ops.py + + Args: + pre_nms_top_n (int): Number of total bboxes to be kept per + image before NMS. default 6000 + post_nms_top_n (int): Number of total bboxes to be kept per + image after NMS. default 1000 + nms_thresh (float): Threshold in NMS. default 0.5 + min_size (flaot): Remove predicted boxes with either height or + width < min_size. default 0.1 + eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + default 1. + topk_after_collect (bool): whether to adopt topk after batch + collection. If topk_after_collect is true, box filter will not be + used after NMS at each image in proposal generation. default false + """ + + def __init__(self, + pre_nms_top_n=12000, + post_nms_top_n=2000, + nms_thresh=.5, + min_size=.1, + eta=1., + topk_after_collect=False): + super(ProposalGenerator, self).__init__() + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.nms_thresh = nms_thresh + self.min_size = min_size + self.eta = eta + self.topk_after_collect = topk_after_collect + + def __call__(self, scores, bbox_deltas, anchors, im_shape): + + top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n + variances = paddle.ones_like(anchors) + rpn_rois, rpn_rois_prob, rpn_rois_num = ops.generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=self.pre_nms_top_n, + post_nms_top_n=top_n, + nms_thresh=self.nms_thresh, + min_size=self.min_size, + eta=self.eta, + return_rois_num=True) + return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n diff --git a/ppdet/modeling/proposal_generator/rpn_head.py b/ppdet/modeling/proposal_generator/rpn_head.py new file mode 100644 index 0000000..2b1e6c7 --- /dev/null +++ b/ppdet/modeling/proposal_generator/rpn_head.py @@ -0,0 +1,253 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
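As a quick illustration of the ProposalGenerator defined above (an editor's sketch, not part of the commit; the import path follows the file location shown in the diff), `topk_after_collect` only changes how many proposals survive per level: `RPNHead._gen_proposal` below always takes a final top-k over the collected proposals, but with `topk_after_collect=True` each level contributes up to `pre_nms_top_n` candidates to that selection instead of being cut to `post_nms_top_n` first.

```python
from ppdet.modeling.proposal_generator.proposal_generator import ProposalGenerator

# Keep pre_nms_top_n proposals per level after NMS; the cut down to
# post_nms_top_n happens later, after all FPN levels are concatenated.
prop_gen = ProposalGenerator(
    pre_nms_top_n=12000, post_nms_top_n=2000, topk_after_collect=True)

# Call signature (input tensors omitted in this sketch):
# rois, roi_probs, rois_num, post_nms_top_n = prop_gen(scores, bbox_deltas, anchors, im_shape)
```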
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register +from ppdet.modeling import ops + +from .anchor_generator import AnchorGenerator +from .target_layer import RPNTargetAssign +from .proposal_generator import ProposalGenerator + + +class RPNFeat(nn.Layer): + """ + Feature extraction in RPN head + + Args: + in_channel (int): Input channel + out_channel (int): Output channel + """ + + def __init__(self, in_channel=1024, out_channel=1024): + super(RPNFeat, self).__init__() + # rpn feat is shared with each level + self.rpn_conv = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=3, + padding=1, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0., std=0.01))) + + def forward(self, feats): + rpn_feats = [] + for feat in feats: + rpn_feats.append(F.relu(self.rpn_conv(feat))) + return rpn_feats + + +@register +class RPNHead(nn.Layer): + """ + Region Proposal Network + + Args: + anchor_generator (dict): configure of anchor generation + rpn_target_assign (dict): configure of rpn targets assignment + train_proposal (dict): configure of proposals generation + at the stage of training + test_proposal (dict): configure of proposals generation + at the stage of prediction + in_channel (int): channel of input feature maps which can be + derived by from_config + """ + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + rpn_target_assign=RPNTargetAssign().__dict__, + train_proposal=ProposalGenerator(12000, 2000).__dict__, + test_proposal=ProposalGenerator().__dict__, + in_channel=1024): + super(RPNHead, self).__init__() + self.anchor_generator = anchor_generator + self.rpn_target_assign = rpn_target_assign + self.train_proposal = train_proposal + self.test_proposal = test_proposal + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGenerator(**anchor_generator) + if isinstance(rpn_target_assign, dict): + self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) + if isinstance(train_proposal, dict): + self.train_proposal = ProposalGenerator(**train_proposal) + if isinstance(test_proposal, dict): + self.test_proposal = ProposalGenerator(**test_proposal) + + num_anchors = self.anchor_generator.num_anchors + self.rpn_feat = RPNFeat(in_channel, in_channel) + # rpn head is shared with each level + # rpn roi classification scores + self.rpn_rois_score = nn.Conv2D( + in_channels=in_channel, + out_channels=num_anchors, + kernel_size=1, + padding=0, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0., std=0.01))) + + # rpn roi bbox regression deltas + self.rpn_rois_delta = nn.Conv2D( + in_channels=in_channel, + out_channels=4 * num_anchors, + kernel_size=1, + padding=0, + weight_attr=paddle.ParamAttr(initializer=Normal( + mean=0., std=0.01))) + + @classmethod + def from_config(cls, cfg, input_shape): + # FPN share same rpn head + if isinstance(input_shape, (list, tuple)): + input_shape = input_shape[0] + return {'in_channel': input_shape.channels} + + def forward(self, feats, inputs): + rpn_feats = self.rpn_feat(feats) + scores = [] + deltas = [] + + for rpn_feat in rpn_feats: + rrs = self.rpn_rois_score(rpn_feat) + rrd = self.rpn_rois_delta(rpn_feat) + scores.append(rrs) + deltas.append(rrd) + + anchors = self.anchor_generator(rpn_feats) + + # TODO: Fix batch_size > 1 when testing. 
+ if self.training: + batch_size = inputs['im_shape'].shape[0] + else: + batch_size = 1 + + rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs, + batch_size) + if self.training: + loss = self.get_loss(scores, deltas, anchors, inputs) + return rois, rois_num, loss + else: + return rois, rois_num, None + + def _gen_proposal(self, scores, bbox_deltas, anchors, inputs, batch_size): + """ + scores (list[Tensor]): Multi-level scores prediction + bbox_deltas (list[Tensor]): Multi-level deltas prediction + anchors (list[Tensor]): Multi-level anchors + inputs (dict): ground truth info + """ + prop_gen = self.train_proposal if self.training else self.test_proposal + im_shape = inputs['im_shape'] + rpn_rois_list = [[] for i in range(batch_size)] + rpn_prob_list = [[] for i in range(batch_size)] + rpn_rois_num_list = [[] for i in range(batch_size)] + # Generate proposals for each level and each batch. + # Discard batch-computing to avoid sorting bbox cross different batches. + for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): + for i in range(batch_size): + rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( + scores=rpn_score[i:i + 1], + bbox_deltas=rpn_delta[i:i + 1], + anchors=anchor, + im_shape=im_shape[i:i + 1]) + if rpn_rois.shape[0] > 0: + rpn_rois_list[i].append(rpn_rois) + rpn_prob_list[i].append(rpn_rois_prob) + rpn_rois_num_list[i].append(rpn_rois_num) + + # Collect multi-level proposals for each batch + # Get 'topk' of them as final output + rois_collect = [] + rois_num_collect = [] + for i in range(batch_size): + if len(scores) > 1: + rpn_rois = paddle.concat(rpn_rois_list[i]) + rpn_prob = paddle.concat(rpn_prob_list[i]).flatten() + if rpn_prob.shape[0] > post_nms_top_n: + topk_prob, topk_inds = paddle.topk(rpn_prob, post_nms_top_n) + topk_rois = paddle.gather(rpn_rois, topk_inds) + else: + topk_rois = rpn_rois + topk_prob = rpn_prob + else: + topk_rois = rpn_rois_list[i][0] + topk_prob = rpn_prob_list[i][0].flatten() + rois_collect.append(topk_rois) + rois_num_collect.append(paddle.shape(topk_rois)[0]) + rois_num_collect = paddle.concat(rois_num_collect) + + return rois_collect, rois_num_collect + + def get_loss(self, pred_scores, pred_deltas, anchors, inputs): + """ + pred_scores (list[Tensor]): Multi-level scores prediction + pred_deltas (list[Tensor]): Multi-level deltas prediction + anchors (list[Tensor]): Multi-level anchors + inputs (dict): ground truth info, including im, gt_bbox, gt_score + """ + anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors] + anchors = paddle.concat(anchors) + + scores = [ + paddle.reshape( + paddle.transpose( + v, perm=[0, 2, 3, 1]), + shape=(v.shape[0], -1, 1)) for v in pred_scores + ] + scores = paddle.concat(scores, axis=1) + + deltas = [ + paddle.reshape( + paddle.transpose( + v, perm=[0, 2, 3, 1]), + shape=(v.shape[0], -1, 4)) for v in pred_deltas + ] + deltas = paddle.concat(deltas, axis=1) + + score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs, + anchors) + + scores = paddle.reshape(x=scores, shape=(-1, )) + deltas = paddle.reshape(x=deltas, shape=(-1, 4)) + + score_tgt = paddle.concat(score_tgt) + score_tgt.stop_gradient = True + + pos_mask = score_tgt == 1 + pos_ind = paddle.nonzero(pos_mask) + + valid_mask = score_tgt >= 0 + valid_ind = paddle.nonzero(valid_mask) + + # cls loss + score_pred = paddle.gather(scores, valid_ind) + score_label = paddle.gather(score_tgt, valid_ind).cast('float32') + score_label.stop_gradient = True + loss_rpn_cls = 
F.binary_cross_entropy_with_logits( + logit=score_pred, label=score_label, reduction="sum") + + # reg loss + loc_pred = paddle.gather(deltas, pos_ind) + loc_tgt = paddle.concat(loc_tgt) + loc_tgt = paddle.gather(loc_tgt, pos_ind) + loc_tgt.stop_gradient = True + loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() + return { + 'loss_rpn_cls': loss_rpn_cls / norm, + 'loss_rpn_reg': loss_rpn_reg / norm + } diff --git a/ppdet/modeling/proposal_generator/target.py b/ppdet/modeling/proposal_generator/target.py new file mode 100644 index 0000000..8e45ef3 --- /dev/null +++ b/ppdet/modeling/proposal_generator/target.py @@ -0,0 +1,606 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import math +import numpy as np +import paddle +from ..bbox_utils import bbox2delta, bbox_overlaps +import copy + + +def rpn_anchor_target(anchors, + gt_boxes, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True, + batch_size=1, + weights=[1., 1., 1., 1.]): + tgt_labels = [] + tgt_bboxes = [] + + tgt_deltas = [] + for i in range(batch_size): + gt_bbox = gt_boxes[i] + + # Step1: match anchor and gt_bbox + matches, match_labels = label_box( + anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True) + # Step2: sample anchor + fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, + rpn_fg_fraction, 0, use_random) + # Fill with the ignore label (-1), then set positive and negative labels + labels = paddle.full(match_labels.shape, -1, dtype='int32') + labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) + labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) + # Step3: make output + matched_gt_boxes = paddle.gather(gt_bbox, matches) + + tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights) + labels.stop_gradient = True + matched_gt_boxes.stop_gradient = True + tgt_delta.stop_gradient = True + tgt_labels.append(labels) + tgt_bboxes.append(matched_gt_boxes) + tgt_deltas.append(tgt_delta) + + return tgt_labels, tgt_bboxes, tgt_deltas + + +def label_box(anchors, gt_boxes, positive_overlap, negative_overlap, + allow_low_quality): + iou = bbox_overlaps(gt_boxes, anchors) + if iou.numel() == 0: + default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') + default_match_labels = paddle.full((iou.shape[1], ), -1, dtype='int32') + return default_matches, default_match_labels + matched_vals, matches = paddle.topk(iou, k=1, axis=0) + match_labels = paddle.full(matches.shape, -1, dtype='int32') + match_labels = paddle.where(matched_vals < negative_overlap, + paddle.zeros_like(match_labels), match_labels) + match_labels = paddle.where(matched_vals >= positive_overlap, + paddle.ones_like(match_labels), match_labels) + if allow_low_quality: + highest_quality_foreach_gt = iou.max(axis=1, keepdim=True) + pred_inds_with_highest_quality = paddle.logical_and( + iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum( + 0, keepdim=True) 
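# Editor's note (descriptive comment, not part of the original commit): `iou` has
# shape [num_gt, num_anchors], so the count computed above says, for every anchor,
# for how many gt boxes it is the best-overlapping anchor. The next statement
# forces any such anchor to be a positive (label 1) even if its IoU is below
# positive_overlap, so that every gt box keeps at least one matched anchor.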
+ match_labels = paddle.where(pred_inds_with_highest_quality > 0, + paddle.ones_like(match_labels), + match_labels) + + matches = matches.flatten() + match_labels = match_labels.flatten() + return matches, match_labels + + +def subsample_labels(labels, + num_samples, + fg_fraction, + bg_label=0, + use_random=True): + positive = paddle.nonzero( + paddle.logical_and(labels != -1, labels != bg_label)) + negative = paddle.nonzero(labels == bg_label) + + positive = positive.cast('int32').flatten() + negative = negative.cast('int32').flatten() + + fg_num = int(num_samples * fg_fraction) + fg_num = min(positive.numel(), fg_num) + bg_num = num_samples - fg_num + bg_num = min(negative.numel(), bg_num) + # randomly select positive and negative examples + fg_perm = paddle.randperm(positive.numel(), dtype='int32') + fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num]) + bg_perm = paddle.randperm(negative.numel(), dtype='int32') + bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num]) + if use_random: + fg_inds = paddle.gather(positive, fg_perm) + bg_inds = paddle.gather(negative, bg_perm) + else: + fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num]) + bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num]) + return fg_inds, bg_inds + + +def generate_proposal_target(rpn_rois, + gt_classes, + gt_boxes, + batch_size_per_im, + fg_fraction, + fg_thresh, + bg_thresh, + num_classes, + use_random=True, + is_cascade=False, + cascade_iou=0.5): + + rois_with_gt = [] + tgt_labels = [] + tgt_bboxes = [] + tgt_gt_inds = [] + new_rois_num = [] + + # In cascade rcnn, the threshold for foreground and background + # is used from cascade_iou + fg_thresh = cascade_iou if is_cascade else fg_thresh + bg_thresh = cascade_iou if is_cascade else bg_thresh + for i, rpn_roi in enumerate(rpn_rois): + gt_bbox = gt_boxes[i] + gt_class = gt_classes[i] + + # Concat RoIs and gt boxes except cascade rcnn + if not is_cascade: + bbox = paddle.concat([rpn_roi, gt_bbox]) + else: + bbox = rpn_roi + + # Step1: label bbox + matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, + False) + # Step2: sample bbox + sampled_inds, sampled_gt_classes = sample_bbox( + matches, match_labels, gt_class, batch_size_per_im, fg_fraction, + num_classes, use_random, is_cascade) + + # Step3: make output + rois_per_image = bbox if is_cascade else paddle.gather(bbox, + sampled_inds) + sampled_gt_ind = matches if is_cascade else paddle.gather(matches, + sampled_inds) + sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) + + rois_per_image.stop_gradient = True + sampled_gt_ind.stop_gradient = True + sampled_bbox.stop_gradient = True + tgt_labels.append(sampled_gt_classes) + tgt_bboxes.append(sampled_bbox) + rois_with_gt.append(rois_per_image) + tgt_gt_inds.append(sampled_gt_ind) + new_rois_num.append(paddle.shape(sampled_inds)[0]) + new_rois_num = paddle.concat(new_rois_num) + return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num + + +def sample_bbox(matches, + match_labels, + gt_classes, + batch_size_per_im, + fg_fraction, + num_classes, + use_random=True, + is_cascade=False): + gt_classes = paddle.gather(gt_classes, matches) + gt_classes = paddle.where(match_labels == 0, + paddle.ones_like(gt_classes) * num_classes, + gt_classes) + gt_classes = paddle.where(match_labels == -1, + paddle.ones_like(gt_classes) * -1, gt_classes) + if is_cascade: + return matches, gt_classes + rois_per_image = int(batch_size_per_im) + + fg_inds, bg_inds = subsample_labels(gt_classes, 
rois_per_image, fg_fraction, + num_classes, use_random) + sampled_inds = paddle.concat([fg_inds, bg_inds]) + sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) + return sampled_inds, sampled_gt_classes + + +def _strip_pad(gt_polys): + new_gt_polys = [] + for i in range(gt_polys.shape[0]): + gt_segs = [] + for j in range(gt_polys[i].shape[0]): + new_poly = [] + polys = gt_polys[i][j] + for ii in range(polys.shape[0]): + x, y = polys[ii] + if (x == -1 and y == -1): + continue + elif (x >= 0 or y >= 0): + new_poly.extend([x, y]) # array, one poly + if len(new_poly) > 6: + gt_segs.append(np.array(new_poly).astype('float64')) + new_gt_polys.append(gt_segs) + return new_gt_polys + + +def polygons_to_mask(polygons, height, width): + """ + Args: + polygons (list[ndarray]): each array has shape (Nx2,) + height, width (int) + Returns: + ndarray: a bool mask of shape (height, width) + """ + import pycocotools.mask as mask_util + assert len(polygons) > 0, "COCOAPI does not support empty polygons" + rles = mask_util.frPyObjects(polygons, height, width) + rle = mask_util.merge(rles) + return mask_util.decode(rle).astype(np.bool) + + +def rasterize_polygons_within_box(poly, box, resolution): + w, h = box[2] - box[0], box[3] - box[1] + + polygons = copy.deepcopy(poly) + for p in polygons: + p[0::2] = p[0::2] - box[0] + p[1::2] = p[1::2] - box[1] + + ratio_h = resolution / max(h, 0.1) + ratio_w = resolution / max(w, 0.1) + + if ratio_h == ratio_w: + for p in polygons: + p *= ratio_h + else: + for p in polygons: + p[0::2] *= ratio_w + p[1::2] *= ratio_h + + # 3. Rasterize the polygons with coco api + mask = polygons_to_mask(polygons, resolution, resolution) + mask = paddle.to_tensor(mask, dtype='int32') + return mask + + +def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, + num_classes, resolution): + mask_rois = [] + mask_rois_num = [] + tgt_masks = [] + tgt_classes = [] + mask_index = [] + tgt_weights = [] + for k in range(len(rois)): + has_fg = True + rois_per_im = rois[k] + gt_segms_per_im = gt_segms[k] + labels_per_im = labels_int32[k] + # select rois labeled with foreground + fg_inds = paddle.nonzero( + paddle.logical_and(labels_per_im != -1, labels_per_im != + num_classes)) + + # generate fake roi if foreground is empty + if fg_inds.numel() == 0: + has_fg = False + fg_inds = paddle.ones([1], dtype='int32') + + inds_per_im = sampled_gt_inds[k] + inds_per_im = paddle.gather(inds_per_im, fg_inds) + + gt_segms_per_im = paddle.gather(gt_segms_per_im, inds_per_im) + + fg_rois = paddle.gather(rois_per_im, fg_inds) + fg_classes = paddle.gather(labels_per_im, fg_inds) + fg_segms = paddle.gather(gt_segms_per_im, fg_inds) + weight = paddle.ones([fg_rois.shape[0]], dtype='float32') + if not has_fg: + weight = weight - 1 + # remove padding + gt_polys = fg_segms.numpy() + boxes = fg_rois.numpy() + new_gt_polys = _strip_pad(gt_polys) + results = [ + rasterize_polygons_within_box(poly, box, resolution) + for poly, box in zip(new_gt_polys, boxes) + ] + tgt_mask = paddle.stack(results) + tgt_mask.stop_gradient = True + fg_rois.stop_gradient = True + + mask_index.append(fg_inds) + mask_rois.append(fg_rois) + mask_rois_num.append(paddle.shape(fg_rois)[0]) + tgt_classes.append(fg_classes) + tgt_masks.append(tgt_mask) + tgt_weights.append(weight) + + mask_index = paddle.concat(mask_index) + mask_rois_num = paddle.concat(mask_rois_num) + tgt_classes = paddle.concat(tgt_classes, axis=0) + tgt_masks = paddle.concat(tgt_masks, axis=0) + tgt_weights = paddle.concat(tgt_weights, axis=0) + + return 
mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights + + +def libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected): + if len(pos_inds) <= num_expected: + return pos_inds + else: + unique_gt_inds = np.unique(max_classes[pos_inds]) + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + + sampled_inds = [] + for i in unique_gt_inds: + inds = np.nonzero(max_classes == i)[0] + before_len = len(inds) + inds = list(set(inds) & set(pos_inds)) + after_len = len(inds) + if len(inds) > num_per_gt: + inds = np.random.choice(inds, size=num_per_gt, replace=False) + sampled_inds.extend(list(inds)) # combine as a new sampler + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(set(pos_inds) - set(sampled_inds))) + assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \ + "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format( + len(sampled_inds), len(extra_inds), len(pos_inds)) + if len(extra_inds) > num_extra: + extra_inds = np.random.choice( + extra_inds, size=num_extra, replace=False) + sampled_inds.extend(extra_inds.tolist()) + elif len(sampled_inds) > num_expected: + sampled_inds = np.random.choice( + sampled_inds, size=num_expected, replace=False) + return paddle.to_tensor(sampled_inds) + + +def libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr, + num_bins, bg_thresh): + max_iou = max_overlaps.max() + iou_interval = (max_iou - floor_thr) / num_bins + per_num_expected = int(num_expected / num_bins) + + sampled_inds = [] + for i in range(num_bins): + start_iou = floor_thr + i * iou_interval + end_iou = floor_thr + (i + 1) * iou_interval + + tmp_set = set( + np.where( + np.logical_and(max_overlaps >= start_iou, max_overlaps < + end_iou))[0]) + tmp_inds = list(tmp_set & full_set) + + if len(tmp_inds) > per_num_expected: + tmp_sampled_set = np.random.choice( + tmp_inds, size=per_num_expected, replace=False) + else: + tmp_sampled_set = np.array(tmp_inds, dtype=np.int) + sampled_inds.append(tmp_sampled_set) + + sampled_inds = np.concatenate(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(full_set - set(sampled_inds))) + assert len(sampled_inds) + len(extra_inds) == len(full_set), \ + "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format( + len(sampled_inds), len(extra_inds), len(full_set)) + + if len(extra_inds) > num_extra: + extra_inds = np.random.choice(extra_inds, num_extra, replace=False) + sampled_inds = np.concatenate([sampled_inds, extra_inds]) + + return sampled_inds + + +def libra_sample_neg(max_overlaps, + max_classes, + neg_inds, + num_expected, + floor_thr=-1, + floor_fraction=0, + num_bins=3, + bg_thresh=0.5): + if len(neg_inds) <= num_expected: + return neg_inds + else: + # balance sampling for negative samples + neg_set = set(neg_inds.tolist()) + if floor_thr > 0: + floor_set = set( + np.where( + np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr)) + [0]) + iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0]) + elif floor_thr == 0: + floor_set = set(np.where(max_overlaps == 0)[0]) + iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) + else: + floor_set = set() + iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) + floor_thr = 0 + + floor_neg_inds = list(floor_set & neg_set) + iou_sampling_neg_inds = list(iou_sampling_set & neg_set) 
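+        # Negatives are split into a "floor" pool (max IoU below floor_thr, which
+        # is empty with the default floor_thr=-1) and an "IoU sampling" pool above
+        # it. The IoU-sampling pool is drawn evenly from num_bins IoU intervals
+        # (see libra_sample_via_interval) so that harder, higher-IoU negatives are
+        # not drowned out by the many easy, near-zero-IoU candidates.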
+ + num_expected_iou_sampling = int(num_expected * (1 - floor_fraction)) + if len(iou_sampling_neg_inds) > num_expected_iou_sampling: + if num_bins >= 2: + iou_sampled_inds = libra_sample_via_interval( + max_overlaps, + set(iou_sampling_neg_inds), num_expected_iou_sampling, + floor_thr, num_bins, bg_thresh) + else: + iou_sampled_inds = np.random.choice( + iou_sampling_neg_inds, + size=num_expected_iou_sampling, + replace=False) + else: + iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int) + num_expected_floor = num_expected - len(iou_sampled_inds) + if len(floor_neg_inds) > num_expected_floor: + sampled_floor_inds = np.random.choice( + floor_neg_inds, size=num_expected_floor, replace=False) + else: + sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int) + sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = np.random.choice( + extra_inds, size=num_extra, replace=False) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + return paddle.to_tensor(sampled_inds) + + +def libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap, + negative_overlap, num_classes): + # TODO: use paddle API to speed up + gt_classes = gt_classes.numpy() + gt_overlaps = np.zeros((anchors.shape[0], num_classes)) + matches = np.zeros((anchors.shape[0]), dtype=np.int32) + if len(gt_boxes) > 0: + proposal_to_gt_overlaps = bbox_overlaps(anchors, gt_boxes).numpy() + overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) + overlaps_max = proposal_to_gt_overlaps.max(axis=1) + # Boxes which with non-zero overlap with gt boxes + overlapped_boxes_ind = np.where(overlaps_max > 0)[0] + overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ + overlapped_boxes_ind]] + + for idx in range(len(overlapped_boxes_ind)): + gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[ + idx]] = overlaps_max[overlapped_boxes_ind[idx]] + matches[overlapped_boxes_ind[idx]] = overlaps_argmax[ + overlapped_boxes_ind[idx]] + + gt_overlaps = paddle.to_tensor(gt_overlaps) + matches = paddle.to_tensor(matches) + + matched_vals = paddle.max(gt_overlaps, axis=1) + match_labels = paddle.full(matches.shape, -1, dtype='int32') + match_labels = paddle.where(matched_vals < negative_overlap, + paddle.zeros_like(match_labels), match_labels) + match_labels = paddle.where(matched_vals >= positive_overlap, + paddle.ones_like(match_labels), match_labels) + + return matches, match_labels, matched_vals + + +def libra_sample_bbox(matches, + match_labels, + matched_vals, + gt_classes, + batch_size_per_im, + num_classes, + fg_fraction, + fg_thresh, + bg_thresh, + num_bins, + use_random=True, + is_cascade_rcnn=False): + rois_per_image = int(batch_size_per_im) + fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) + bg_rois_per_im = rois_per_image - fg_rois_per_im + + if is_cascade_rcnn: + fg_inds = paddle.nonzero(matched_vals >= fg_thresh) + bg_inds = paddle.nonzero(matched_vals < bg_thresh) + else: + matched_vals_np = matched_vals.numpy() + match_labels_np = match_labels.numpy() + + # sample fg + fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten() + fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0])) + if (fg_inds.shape[0] > fg_nums) and use_random: + fg_inds = libra_sample_pos(matched_vals_np, match_labels_np, + fg_inds.numpy(), fg_rois_per_im) + fg_inds = fg_inds[:fg_nums] + + # sample 
bg + bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten() + bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0])) + if (bg_inds.shape[0] > bg_nums) and use_random: + bg_inds = libra_sample_neg( + matched_vals_np, + match_labels_np, + bg_inds.numpy(), + bg_rois_per_im, + num_bins=num_bins, + bg_thresh=bg_thresh) + bg_inds = bg_inds[:bg_nums] + + sampled_inds = paddle.concat([fg_inds, bg_inds]) + + gt_classes = paddle.gather(gt_classes, matches) + gt_classes = paddle.where(match_labels == 0, + paddle.ones_like(gt_classes) * num_classes, + gt_classes) + gt_classes = paddle.where(match_labels == -1, + paddle.ones_like(gt_classes) * -1, gt_classes) + sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) + + return sampled_inds, sampled_gt_classes + + +def libra_generate_proposal_target(rpn_rois, + gt_classes, + gt_boxes, + batch_size_per_im, + fg_fraction, + fg_thresh, + bg_thresh, + num_classes, + use_random=True, + is_cascade_rcnn=False, + max_overlaps=None, + num_bins=3): + + rois_with_gt = [] + tgt_labels = [] + tgt_bboxes = [] + sampled_max_overlaps = [] + tgt_gt_inds = [] + new_rois_num = [] + + for i, rpn_roi in enumerate(rpn_rois): + max_overlap = max_overlaps[i] if is_cascade_rcnn else None + gt_bbox = gt_boxes[i] + gt_class = gt_classes[i] + if is_cascade_rcnn: + rpn_roi = filter_roi(rpn_roi, max_overlap) + bbox = paddle.concat([rpn_roi, gt_bbox]) + + # Step1: label bbox + matches, match_labels, matched_vals = libra_label_box( + bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes) + + # Step2: sample bbox + sampled_inds, sampled_gt_classes = libra_sample_bbox( + matches, match_labels, matched_vals, gt_class, batch_size_per_im, + num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, + use_random, is_cascade_rcnn) + + # Step3: make output + rois_per_image = paddle.gather(bbox, sampled_inds) + sampled_gt_ind = paddle.gather(matches, sampled_inds) + sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) + sampled_overlap = paddle.gather(matched_vals, sampled_inds) + + rois_per_image.stop_gradient = True + sampled_gt_ind.stop_gradient = True + sampled_bbox.stop_gradient = True + sampled_overlap.stop_gradient = True + + tgt_labels.append(sampled_gt_classes) + tgt_bboxes.append(sampled_bbox) + rois_with_gt.append(rois_per_image) + sampled_max_overlaps.append(sampled_overlap) + tgt_gt_inds.append(sampled_gt_ind) + new_rois_num.append(paddle.shape(sampled_inds)[0]) + new_rois_num = paddle.concat(new_rois_num) + # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num + return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num diff --git a/ppdet/modeling/proposal_generator/target_layer.py b/ppdet/modeling/proposal_generator/target_layer.py new file mode 100644 index 0000000..cdf405e --- /dev/null +++ b/ppdet/modeling/proposal_generator/target_layer.py @@ -0,0 +1,418 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
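The Step1–Step3 comments above outline the Libra R-CNN flow: label each proposal against the gt boxes, then sample foreground and background with IoU-balanced binning. Below is a minimal NumPy-only sketch of the binning idea behind `libra_sample_via_interval`; the function name, the fixed per-bin quota, and the toy data are illustrative assumptions, not the repository's API.

```
import numpy as np

def sample_negatives_by_iou_bins(max_overlaps, candidate_inds, num_expected,
                                 floor_thr=0.0, num_bins=3, seed=0):
    # Draw roughly num_expected negatives, spread evenly over num_bins IoU bins.
    rng = np.random.RandomState(seed)
    cand_iou = max_overlaps[candidate_inds]
    iou_interval = (cand_iou.max() - floor_thr) / num_bins
    per_bin = int(num_expected / num_bins)

    sampled = []
    for i in range(num_bins):
        lo = floor_thr + i * iou_interval
        hi = floor_thr + (i + 1) * iou_interval
        in_bin = candidate_inds[(cand_iou >= lo) & (cand_iou < hi)]
        if len(in_bin) > per_bin:
            in_bin = rng.choice(in_bin, size=per_bin, replace=False)
        sampled.append(in_bin)
    sampled = np.concatenate(sampled)

    # Top up from the remaining candidates if some bins were too sparse.
    if len(sampled) < num_expected:
        rest = np.setdiff1d(candidate_inds, sampled)
        extra = rng.choice(rest,
                           size=min(num_expected - len(sampled), len(rest)),
                           replace=False)
        sampled = np.concatenate([sampled, extra])
    return sampled

# Toy check: 100 negative proposals with IoU in [0, 0.5), request 30 samples.
overlaps = np.random.RandomState(1).uniform(0.0, 0.5, size=100)
print(sample_negatives_by_iou_bins(overlaps, np.arange(100), num_expected=30).shape)
```

In the repository, the per-bin quota, the separate floor pool, and the final top-up are handled by `libra_sample_neg` above.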
+import sys +import paddle +from ppdet.core.workspace import register, serializable +from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target +from ppdet.modeling import bbox_utils +import numpy as np + + +@register +@serializable +class RPNTargetAssign(object): + """ + RPN targets assignment module + + The assignment consists of three steps: + 1. Match anchor and ground-truth box, label the anchor with foreground + or background sample + 2. Sample anchors to keep the properly ratio between foreground and + background + 3. Generate the targets for classification and regression branch + + + Args: + batch_size_per_im (int): Total number of RPN samples per image. + default 256 + fg_fraction (float): Fraction of anchors that is labeled + foreground, default 0.5 + positive_overlap (float): Minimum overlap required between an anchor + and ground-truth box for the (anchor, gt box) pair to be + a foreground sample. default 0.7 + negative_overlap (float): Maximum overlap allowed between an anchor + and ground-truth box for the (anchor, gt box) pair to be + a background sample. default 0.3 + use_random (bool): Use random sampling to choose foreground and + background boxes, default true. + """ + + def __init__(self, + batch_size_per_im=256, + fg_fraction=0.5, + positive_overlap=0.7, + negative_overlap=0.3, + use_random=True): + super(RPNTargetAssign, self).__init__() + self.batch_size_per_im = batch_size_per_im + self.fg_fraction = fg_fraction + self.positive_overlap = positive_overlap + self.negative_overlap = negative_overlap + self.use_random = use_random + + def __call__(self, inputs, anchors): + """ + inputs: ground-truth instances. + anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors in all feature maps. + """ + gt_boxes = inputs['gt_bbox'] + batch_size = gt_boxes.shape[0] + tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target( + anchors, gt_boxes, self.batch_size_per_im, self.positive_overlap, + self.negative_overlap, self.fg_fraction, self.use_random, + batch_size) + norm = self.batch_size_per_im * batch_size + + return tgt_labels, tgt_bboxes, tgt_deltas, norm + + +@register +class BBoxAssigner(object): + __shared__ = ['num_classes'] + """ + RCNN targets assignment module + + The assignment consists of three steps: + 1. Match RoIs and ground-truth box, label the RoIs with foreground + or background sample + 2. Sample anchors to keep the properly ratio between foreground and + background + 3. Generate the targets for classification and regression branch + + Args: + batch_size_per_im (int): Total number of RoIs per image. + default 512 + fg_fraction (float): Fraction of RoIs that is labeled + foreground, default 0.25 + fg_thresh (float): Minimum overlap required between a RoI + and ground-truth box for the (roi, gt box) pair to be + a foreground sample. default 0.5 + bg_thresh (float): Maximum overlap allowed between a RoI + and ground-truth box for the (roi, gt box) pair to be + a background sample. default 0.5 + use_random (bool): Use random sampling to choose foreground and + background boxes, default true + cascade_iou (list[iou]): The list of overlap to select foreground and + background of each stage, which is only used In Cascade RCNN. + num_classes (int): The number of class. 
+ """ + + def __init__(self, + batch_size_per_im=512, + fg_fraction=.25, + fg_thresh=.5, + bg_thresh=.5, + use_random=True, + cascade_iou=[0.5, 0.6, 0.7], + num_classes=80): + super(BBoxAssigner, self).__init__() + self.batch_size_per_im = batch_size_per_im + self.fg_fraction = fg_fraction + self.fg_thresh = fg_thresh + self.bg_thresh = bg_thresh + self.use_random = use_random + self.cascade_iou = cascade_iou + self.num_classes = num_classes + + def __call__(self, + rpn_rois, + rpn_rois_num, + inputs, + stage=0, + is_cascade=False): + gt_classes = inputs['gt_class'] + gt_boxes = inputs['gt_bbox'] + # rois, tgt_labels, tgt_bboxes, tgt_gt_inds + # new_rois_num + outs = generate_proposal_target( + rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, + self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, + self.use_random, is_cascade, self.cascade_iou[stage]) + rois = outs[0] + rois_num = outs[-1] + # tgt_labels, tgt_bboxes, tgt_gt_inds + targets = outs[1:4] + return rois, rois_num, targets + + +@register +class BBoxLibraAssigner(object): + __shared__ = ['num_classes'] + """ + Libra-RCNN targets assignment module + + The assignment consists of three steps: + 1. Match RoIs and ground-truth box, label the RoIs with foreground + or background sample + 2. Sample anchors to keep the properly ratio between foreground and + background + 3. Generate the targets for classification and regression branch + + Args: + batch_size_per_im (int): Total number of RoIs per image. + default 512 + fg_fraction (float): Fraction of RoIs that is labeled + foreground, default 0.25 + fg_thresh (float): Minimum overlap required between a RoI + and ground-truth box for the (roi, gt box) pair to be + a foreground sample. default 0.5 + bg_thresh (float): Maximum overlap allowed between a RoI + and ground-truth box for the (roi, gt box) pair to be + a background sample. default 0.5 + use_random (bool): Use random sampling to choose foreground and + background boxes, default true + cascade_iou (list[iou]): The list of overlap to select foreground and + background of each stage, which is only used In Cascade RCNN. + num_classes (int): The number of class. + num_bins (int): The number of libra_sample. + """ + + def __init__(self, + batch_size_per_im=512, + fg_fraction=.25, + fg_thresh=.5, + bg_thresh=.5, + use_random=True, + cascade_iou=[0.5, 0.6, 0.7], + num_classes=80, + num_bins=3): + super(BBoxLibraAssigner, self).__init__() + self.batch_size_per_im = batch_size_per_im + self.fg_fraction = fg_fraction + self.fg_thresh = fg_thresh + self.bg_thresh = bg_thresh + self.use_random = use_random + self.cascade_iou = cascade_iou + self.num_classes = num_classes + self.num_bins = num_bins + + def __call__(self, + rpn_rois, + rpn_rois_num, + inputs, + stage=0, + is_cascade=False): + gt_classes = inputs['gt_class'] + gt_boxes = inputs['gt_bbox'] + # rois, tgt_labels, tgt_bboxes, tgt_gt_inds + outs = libra_generate_proposal_target( + rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, + self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, + self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins) + rois = outs[0] + rois_num = outs[-1] + # tgt_labels, tgt_bboxes, tgt_gt_inds + targets = outs[1:4] + return rois, rois_num, targets + + +@register +@serializable +class MaskAssigner(object): + __shared__ = ['num_classes', 'mask_resolution'] + """ + Mask targets assignment module + + The assignment consists of three steps: + 1. Select RoIs labels with foreground. + 2. 
Encode the RoIs and corresponding gt polygons to generate + mask target + + Args: + num_classes (int): The number of class + mask_resolution (int): The resolution of mask target, default 14 + """ + + def __init__(self, num_classes=80, mask_resolution=14): + super(MaskAssigner, self).__init__() + self.num_classes = num_classes + self.mask_resolution = mask_resolution + + def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs): + gt_segms = inputs['gt_poly'] + + outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds, + self.num_classes, self.mask_resolution) + + # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights + return outs + + +@register +class RBoxAssigner(object): + """ + assigner of rbox + Args: + pos_iou_thr (float): threshold of pos samples + neg_iou_thr (float): threshold of neg samples + min_iou_thr (float): the min threshold of samples + ignore_iof_thr (int): the ignored threshold + """ + + def __init__(self, + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_iou_thr=0.0, + ignore_iof_thr=-2): + super(RBoxAssigner, self).__init__() + + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_iou_thr = min_iou_thr + self.ignore_iof_thr = ignore_iof_thr + + def anchor_valid(self, anchors): + """ + + Args: + anchor: M x 4 + + Returns: + + """ + if anchors.ndim == 3: + anchors = anchors.reshape(-1, anchor.shape[-1]) + assert anchors.ndim == 2 + anchor_num = anchors.shape[0] + anchor_valid = np.ones((anchor_num), np.uint8) + anchor_inds = np.arange(anchor_num) + return anchor_inds + + def assign_anchor(self, + anchors, + gt_bboxes, + gt_lables, + pos_iou_thr, + neg_iou_thr, + min_iou_thr=0.0, + ignore_iof_thr=-2): + """ + + Args: + anchors: + gt_bboxes:[M, 5] rc,yc,w,h,angle + gt_lables: + + Returns: + + """ + assert anchors.shape[1] == 4 or anchors.shape[1] == 5 + assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 + anchors_xc_yc = anchors + gt_bboxes_xc_yc = gt_bboxes + + # calc rbox iou + anchors_xc_yc = anchors_xc_yc.astype(np.float32) + gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32) + anchors_xc_yc = paddle.to_tensor(anchors_xc_yc, place=paddle.CPUPlace()) + gt_bboxes_xc_yc = paddle.to_tensor( + gt_bboxes_xc_yc, place=paddle.CPUPlace()) + + try: + from rbox_iou_ops import rbox_iou + except Exception as e: + print("import custom_ops error, try install rbox_iou_ops " \ + "following ppdet/ext_op/README.md", e) + sys.stdout.flush() + sys.exit(-1) + + iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc) + iou = iou.numpy() + iou = iou.T + + # every gt's anchor's index + gt_bbox_anchor_inds = iou.argmax(axis=0) + gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])] + gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0] + + # every anchor's gt bbox's index + anchor_gt_bbox_inds = iou.argmax(axis=1) + anchor_gt_bbox_iou = iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds] + + # (1) set labels=-2 as default + labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr + + # (2) assign ignore + labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr + + # (3) assign neg_ids -1 + assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr + assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr + assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2) + labels[assign_neg_ids] = -1 + + # anchor_gt_bbox_iou_inds + # (4) assign max_iou as pos_ids >=0 + anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] + # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, 
anchor_gt_bbox_iou >= min_iou_thr) + labels[gt_bbox_anchor_iou_inds] = gt_lables[anchor_gt_bbox_iou_inds] + + # (5) assign >= pos_iou_thr as pos_ids + iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr + iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] + labels[iou_pos_iou_thr_ids] = gt_lables[iou_pos_iou_thr_ids_box_inds] + return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels + + def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): + + assert anchors.ndim == 2 + assert anchors.shape[1] == 5 + assert gt_bboxes.ndim == 2 + assert gt_bboxes.shape[1] == 5 + + pos_iou_thr = self.pos_iou_thr + neg_iou_thr = self.neg_iou_thr + min_iou_thr = self.min_iou_thr + ignore_iof_thr = self.ignore_iof_thr + + anchor_num = anchors.shape[0] + anchors_inds = self.anchor_valid(anchors) + anchors = anchors[anchors_inds] + gt_bboxes = gt_bboxes + is_crowd_slice = is_crowd + not_crowd_inds = np.where(is_crowd_slice == 0) + + # Step1: match anchor and gt_bbox + anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor( + anchors, gt_bboxes, + gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr, + ignore_iof_thr) + + # Step2: sample anchor + pos_inds = np.where(labels >= 0)[0] + neg_inds = np.where(labels == -1)[0] + + # Step3: make output + anchors_num = anchors.shape[0] + bbox_targets = np.zeros_like(anchors) + bbox_weights = np.zeros_like(anchors) + pos_labels = np.ones(anchors_num, dtype=np.int32) * -1 + pos_labels_weights = np.zeros(anchors_num, dtype=np.float32) + + pos_sampled_anchors = anchors[pos_inds] + #print('ancho target pos_inds', pos_inds, len(pos_inds)) + pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]] + if len(pos_inds) > 0: + pos_bbox_targets = bbox_utils.rbox2delta(pos_sampled_anchors, + pos_sampled_gt_boxes) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + pos_labels[pos_inds] = labels[pos_inds] + pos_labels_weights[pos_inds] = 1.0 + + if len(neg_inds) > 0: + pos_labels_weights[neg_inds] = 1.0 + return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) diff --git a/ppdet/modeling/shape_spec.py b/ppdet/modeling/shape_spec.py new file mode 100644 index 0000000..a4d4a2f --- /dev/null +++ b/ppdet/modeling/shape_spec.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + + +class ShapeSpec( + namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + """ + A simple structure that contains basic shape specification about a tensor. + It is often used as the auxiliary inputs/outputs of models, + to complement the lack of shape inference ability among paddle modules. 
+ Attributes: + channels: + height: + width: + stride: + """ + + def __new__(cls, channels=None, height=None, width=None, stride=None): + return super(ShapeSpec, cls).__new__(cls, channels, height, width, + stride) diff --git a/ppdet/modeling/tests/__init__.py b/ppdet/modeling/tests/__init__.py new file mode 100644 index 0000000..847ddc4 --- /dev/null +++ b/ppdet/modeling/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/modeling/tests/test_architectures.py b/ppdet/modeling/tests/test_architectures.py new file mode 100644 index 0000000..95cb212 --- /dev/null +++ b/ppdet/modeling/tests/test_architectures.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import ppdet + + +class TestFasterRCNN(unittest.TestCase): + def setUp(self): + self.set_config() + + def set_config(self): + self.cfg_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml' + + def test_trainer(self): + # Trainer __init__ will build model and DataLoader + # 'train' and 'eval' mode include dataset loading + # use 'test' mode to simplify tests + cfg = ppdet.core.workspace.load_config(self.cfg_file) + trainer = ppdet.engine.Trainer(cfg, mode='test') + + +class TestMaskRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml' + + +class TestCascadeRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml' + + +class TestYolov3(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml' + + +class TestSSD(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/ssd/ssd_vgg16_300_240e_voc.yml' + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/modeling/tests/test_base.py b/ppdet/modeling/tests/test_base.py new file mode 100644 index 0000000..cfd03fe --- /dev/null +++ b/ppdet/modeling/tests/test_base.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import contextlib +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.framework import Program +from paddle.fluid import core + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self, force_to_use_cpu=False): + # this option for ops that only have cpu kernel + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + @contextlib.contextmanager + def static_graph(self): + paddle.enable_static() + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with fluid.program_guard(program): + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield + + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = fluid.Executor(self._get_place(force_to_use_cpu)) + exe.run(fluid.default_startup_program()) + return exe.run(fluid.default_main_program(), + feed=feed, + fetch_list=fetch_list, + return_numpy=(not with_lod)) + + @contextlib.contextmanager + def dynamic_graph(self, force_to_use_cpu=False): + paddle.disable_static() + with fluid.dygraph.guard( + self._get_place(force_to_use_cpu=force_to_use_cpu)): + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield diff --git a/ppdet/modeling/tests/test_ops.py b/ppdet/modeling/tests/test_ops.py new file mode 100644 index 0000000..bccebc2 --- /dev/null +++ b/ppdet/modeling/tests/test_ops.py @@ -0,0 +1,851 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
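`LayerTest` above provides the `static_graph`/`dynamic_graph` context managers plus `get_static_graph_result`, and the op tests that follow all use the same parity pattern: run an op once in static mode, once in dynamic mode, and assert the outputs match. A condensed, hypothetical example of that pattern (`TestReluParity` is not part of the repository):

```
import unittest

import numpy as np
import paddle

from ppdet.modeling.tests.test_base import LayerTest


class TestReluParity(LayerTest):
    def test_relu(self):
        x_np = np.random.rand(2, 3).astype('float32')

        # Static-graph run through the helper executor.
        with self.static_graph():
            x = paddle.static.data(name='x', shape=[2, 3], dtype='float32')
            y = paddle.nn.functional.relu(x)
            y_stat, = self.get_static_graph_result(
                feed={'x': x_np}, fetch_list=[y], with_lod=False)

        # Dynamic-graph run on the same input.
        with self.dynamic_graph():
            y_dy = paddle.nn.functional.relu(paddle.to_tensor(x_np)).numpy()

        self.assertTrue(np.array_equal(y_stat, y_dy))


if __name__ == '__main__':
    unittest.main()
```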
+ +from __future__ import print_function +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.dygraph import base + +import ppdet.modeling.ops as ops +from ppdet.modeling.tests.test_base import LayerTest + + +def make_rois(h, w, rois_num, output_size): + rois = np.zeros((0, 4)).astype('float32') + for roi_num in rois_num: + roi = np.zeros((roi_num, 4)).astype('float32') + roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num) + roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num) + roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h) + roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w) + rois = np.vstack((rois, roi)) + return rois + + +def softmax(x): + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestCollectFpnProposals(LayerTest): + def test_collect_fpn_proposals(self): + multi_bboxes_np = [] + multi_scores_np = [] + rois_num_per_level_np = [] + for i in range(4): + bboxes_np = np.random.rand(5, 4).astype('float32') + scores_np = np.random.rand(5, 1).astype('float32') + rois_num = np.array([2, 3]).astype('int32') + multi_bboxes_np.append(bboxes_np) + multi_scores_np.append(scores_np) + rois_num_per_level_np.append(rois_num) + + with self.static_graph(): + multi_bboxes = [] + multi_scores = [] + rois_num_per_level = [] + for i in range(4): + bboxes = paddle.static.data( + name='rois' + str(i), + shape=[5, 4], + dtype='float32', + lod_level=1) + scores = paddle.static.data( + name='scores' + str(i), + shape=[5, 1], + dtype='float32', + lod_level=1) + rois_num = paddle.static.data( + name='rois_num' + str(i), shape=[None], dtype='int32') + + multi_bboxes.append(bboxes) + multi_scores.append(scores) + rois_num_per_level.append(rois_num) + + fpn_rois, rois_num = ops.collect_fpn_proposals( + multi_bboxes, + multi_scores, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level) + feed = {} + for i in range(4): + feed['rois' + str(i)] = multi_bboxes_np[i] + feed['scores' + str(i)] = multi_scores_np[i] + feed['rois_num' + str(i)] = rois_num_per_level_np[i] + fpn_rois_stat, rois_num_stat = self.get_static_graph_result( + feed=feed, fetch_list=[fpn_rois, rois_num], with_lod=True) + fpn_rois_stat = np.array(fpn_rois_stat) + rois_num_stat = np.array(rois_num_stat) + + with self.dynamic_graph(): + multi_bboxes_dy = [] + multi_scores_dy = [] + rois_num_per_level_dy = [] + for i in range(4): + bboxes_dy = base.to_variable(multi_bboxes_np[i]) + scores_dy = base.to_variable(multi_scores_np[i]) + rois_num_dy = base.to_variable(rois_num_per_level_np[i]) + multi_bboxes_dy.append(bboxes_dy) + multi_scores_dy.append(scores_dy) + rois_num_per_level_dy.append(rois_num_dy) + fpn_rois_dy, rois_num_dy = ops.collect_fpn_proposals( + multi_bboxes_dy, + multi_scores_dy, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level_dy) + fpn_rois_dy = fpn_rois_dy.numpy() + rois_num_dy = rois_num_dy.numpy() + + self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy)) + self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy)) + + def test_collect_fpn_proposals_error(self): + def generate_input(bbox_type, score_type, name): + 
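+            # Build four levels of static (rois, scores) inputs with the requested
+            # dtypes, so the error cases below can feed wrongly-typed tensors.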
multi_bboxes = [] + multi_scores = [] + for i in range(4): + bboxes = paddle.static.data( + name='rois' + name + str(i), + shape=[10, 4], + dtype=bbox_type, + lod_level=1) + scores = paddle.static.data( + name='scores' + name + str(i), + shape=[10, 1], + dtype=score_type, + lod_level=1) + multi_bboxes.append(bboxes) + multi_scores.append(scores) + return multi_bboxes, multi_scores + + with self.static_graph(): + bbox1 = paddle.static.data( + name='rois', shape=[5, 10, 4], dtype='float32', lod_level=1) + score1 = paddle.static.data( + name='scores', shape=[5, 10, 1], dtype='float32', lod_level=1) + bbox2, score2 = generate_input('int32', 'float32', '2') + self.assertRaises( + TypeError, + ops.collect_fpn_proposals, + multi_rois=bbox1, + multi_scores=score1, + min_level=2, + max_level=5, + post_nms_top_n=2000) + self.assertRaises( + TypeError, + ops.collect_fpn_proposals, + multi_rois=bbox2, + multi_scores=score2, + min_level=2, + max_level=5, + post_nms_top_n=2000) + + +class TestDistributeFpnProposals(LayerTest): + def test_distribute_fpn_proposals(self): + rois_np = np.random.rand(10, 4).astype('float32') + rois_num_np = np.array([4, 6]).astype('int32') + with self.static_graph(): + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + multi_rois, restore_ind, rois_num_per_level = ops.distribute_fpn_proposals( + fpn_rois=rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + output_stat = self.get_static_graph_result( + feed={'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=fetch_list, + with_lod=True) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + with self.dynamic_graph(): + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = ops.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.array_equal(res_stat, res_dy)) + + def test_distribute_fpn_proposals_error(self): + with self.static_graph(): + fpn_rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + ops.distribute_fpn_proposals, + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + + +class TestROIAlign(LayerTest): + def test_roi_align(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output = ops.roi_align( + input=inputs, + rois=rois, + output_size=output_size, + rois_num=rois_num) + output_np, = 
self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = base.to_variable(inputs_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + + output_dy = ops.roi_align( + input=inputs_dy, + rois=rois_dy, + output_size=output_size, + rois_num=rois_num_dy) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_align_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + ops.roi_align, + input=inputs, + rois=rois, + output_size=(7, 7)) + + +class TestROIPool(LayerTest): + def test_roi_pool(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + rois_num = [4, 6] + output_size = (7, 7) + rois_np = make_rois(h, w, rois_num, output_size) + rois_num_np = np.array(rois_num).astype('int32') + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[b, c, h, w], dtype='float32') + rois = paddle.static.data( + name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data( + name='rois_num', shape=[None], dtype='int32') + + output, _ = ops.roi_pool( + input=inputs, + rois=rois, + output_size=output_size, + rois_num=rois_num) + output_np, = self.get_static_graph_result( + feed={ + 'inputs': inputs_np, + 'rois': rois_np, + 'rois_num': rois_num_np + }, + fetch_list=[output], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = base.to_variable(inputs_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + + output_dy, _ = ops.roi_pool( + input=inputs_dy, + rois=rois_dy, + output_size=output_size, + rois_num=rois_num_dy) + output_dy_np = output_dy.numpy() + + self.assertTrue(np.array_equal(output_np, output_dy_np)) + + def test_roi_pool_error(self): + with self.static_graph(): + inputs = paddle.static.data( + name='inputs', shape=[2, 12, 20, 20], dtype='float32') + rois = paddle.static.data( + name='data_error', shape=[10, 4], dtype='int32', lod_level=1) + self.assertRaises( + TypeError, + ops.roi_pool, + input=inputs, + rois=rois, + output_size=(7, 7)) + + +class TestIoUSimilarity(LayerTest): + def test_iou_similarity(self): + b, c, h, w = 2, 12, 20, 20 + inputs_np = np.random.rand(b, c, h, w).astype('float32') + output_size = (7, 7) + x_np = make_rois(h, w, [20], output_size) + y_np = make_rois(h, w, [10], output_size) + with self.static_graph(): + x = paddle.static.data(name='x', shape=[20, 4], dtype='float32') + y = paddle.static.data(name='y', shape=[10, 4], dtype='float32') + + iou = ops.iou_similarity(x=x, y=y) + iou_np, = self.get_static_graph_result( + feed={ + 'x': x_np, + 'y': y_np, + }, fetch_list=[iou], with_lod=False) + + with self.dynamic_graph(): + x_dy = base.to_variable(x_np) + y_dy = base.to_variable(y_np) + + iou_dy = ops.iou_similarity(x=x_dy, y=y_dy) + iou_dy_np = iou_dy.numpy() + + self.assertTrue(np.array_equal(iou_np, iou_dy_np)) + + +class TestBipartiteMatch(LayerTest): + def test_bipartite_match(self): + distance = np.random.random((20, 10)).astype('float32') + with self.static_graph(): + x = paddle.static.data(name='x', shape=[20, 10], dtype='float32') + + match_indices, match_dist = ops.bipartite_match( + x, 
match_type='per_prediction', dist_threshold=0.5) + match_indices_np, match_dist_np = self.get_static_graph_result( + feed={'x': distance, }, + fetch_list=[match_indices, match_dist], + with_lod=False) + + with self.dynamic_graph(): + x_dy = base.to_variable(distance) + + match_indices_dy, match_dist_dy = ops.bipartite_match( + x_dy, match_type='per_prediction', dist_threshold=0.5) + match_indices_dy_np = match_indices_dy.numpy() + match_dist_dy_np = match_dist_dy.numpy() + + self.assertTrue(np.array_equal(match_indices_np, match_indices_dy_np)) + self.assertTrue(np.array_equal(match_dist_np, match_dist_dy_np)) + + +class TestYoloBox(LayerTest): + def test_yolo_box(self): + + # x shape [N C H W], C=K * (5 + class_num), class_num=10, K=2 + np_x = np.random.random([1, 30, 7, 7]).astype('float32') + np_origin_shape = np.array([[608, 608]], dtype='int32') + class_num = 10 + conf_thresh = 0.01 + downsample_ratio = 32 + scale_x_y = 1.2 + + # static + with self.static_graph(): + # x shape [N C H W], C=K * (5 + class_num), class_num=10, K=2 + x = paddle.static.data( + name='x', shape=[1, 30, 7, 7], dtype='float32') + origin_shape = paddle.static.data( + name='origin_shape', shape=[1, 2], dtype='int32') + + boxes, scores = ops.yolo_box( + x, + origin_shape, [10, 13, 30, 13], + class_num, + conf_thresh, + downsample_ratio, + scale_x_y=scale_x_y) + + boxes_np, scores_np = self.get_static_graph_result( + feed={ + 'x': np_x, + 'origin_shape': np_origin_shape, + }, + fetch_list=[boxes, scores], + with_lod=False) + + # dygraph + with self.dynamic_graph(): + x_dy = fluid.layers.assign(np_x) + origin_shape_dy = fluid.layers.assign(np_origin_shape) + + boxes_dy, scores_dy = ops.yolo_box( + x_dy, + origin_shape_dy, [10, 13, 30, 13], + 10, + 0.01, + 32, + scale_x_y=scale_x_y) + + boxes_dy_np = boxes_dy.numpy() + scores_dy_np = scores_dy.numpy() + + self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) + self.assertTrue(np.array_equal(scores_np, scores_dy_np)) + + def test_yolo_box_error(self): + with self.static_graph(): + # x shape [N C H W], C=K * (5 + class_num), class_num=10, K=2 + x = paddle.static.data( + name='x', shape=[1, 30, 7, 7], dtype='float32') + origin_shape = paddle.static.data( + name='origin_shape', shape=[1, 2], dtype='int32') + + self.assertRaises( + TypeError, + ops.yolo_box, + x, + origin_shape, [10, 13, 30, 13], + 10.123, + 0.01, + 32, + scale_x_y=1.2) + + +class TestPriorBox(LayerTest): + def test_prior_box(self): + input_np = np.random.rand(2, 10, 32, 32).astype('float32') + image_np = np.random.rand(2, 10, 40, 40).astype('float32') + min_sizes = [2, 4] + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='float32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='float32') + + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=min_sizes, + clip=True, + flip=True) + box_np, var_np = self.get_static_graph_result( + feed={ + 'input': input_np, + 'image': image_np, + }, + fetch_list=[box, var], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = base.to_variable(input_np) + image_dy = base.to_variable(image_np) + + box_dy, var_dy = ops.prior_box( + input=inputs_dy, + image=image_dy, + min_sizes=min_sizes, + clip=True, + flip=True) + box_dy_np = box_dy.numpy() + var_dy_np = var_dy.numpy() + + self.assertTrue(np.array_equal(box_np, box_dy_np)) + self.assertTrue(np.array_equal(var_np, var_dy_np)) + + def test_prior_box_error(self): + with self.static_graph(): + input = 
paddle.static.data( + name='input', shape=[2, 10, 32, 32], dtype='int32') + image = paddle.static.data( + name='image', shape=[2, 10, 40, 40], dtype='int32') + self.assertRaises( + TypeError, + ops.prior_box, + input=input, + image=image, + min_sizes=[2, 4], + clip=True, + flip=True) + + +class TestAnchorGenerator(LayerTest): + def test_anchor_generator(self): + b, c, h, w = 2, 48, 16, 16 + input_np = np.random.rand(2, 48, 16, 16).astype('float32') + with self.static_graph(): + input = paddle.static.data( + name='input', shape=[b, c, h, w], dtype='float32') + + anchor, var = ops.anchor_generator( + input=input, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + anchor_np, var_np = self.get_static_graph_result( + feed={'input': input_np, }, + fetch_list=[anchor, var], + with_lod=False) + + with self.dynamic_graph(): + inputs_dy = base.to_variable(input_np) + + anchor_dy, var_dy = ops.anchor_generator( + input=inputs_dy, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + anchor_dy_np = anchor_dy.numpy() + var_dy_np = var_dy.numpy() + + self.assertTrue(np.array_equal(anchor_np, anchor_dy_np)) + self.assertTrue(np.array_equal(var_np, var_dy_np)) + + +class TestMulticlassNms(LayerTest): + def test_multiclass_nms(self): + boxes_np = np.random.rand(81, 4).astype('float32') + scores_np = np.random.rand(81).astype('float32') + rois_num_np = np.array([40, 41]).astype('int32') + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) + scores = paddle.static.data( + name='scores', shape=[81], dtype='float32', lod_level=1) + rois_num = paddle.static.data( + name='rois_num', shape=[40, 41], dtype='int32') + + output = ops.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num) + out_np, index_np, nms_rois_num_np = self.get_static_graph_result( + feed={ + 'bboxes': boxes_np, + 'scores': scores_np, + 'rois_num': rois_num_np + }, + fetch_list=output, + with_lod=False) + + with self.dynamic_graph(): + boxes_dy = base.to_variable(boxes_np) + scores_dy = base.to_variable(scores_np) + rois_num_dy = base.to_variable(rois_num_np) + + out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms( + bboxes=boxes_dy, + scores=scores_dy, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True, + rois_num=rois_num_dy) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + nms_rois_num_dy_np = nms_rois_num_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np)) + + def test_multiclass_nms_error(self): + with self.static_graph(): + boxes = paddle.static.data( + name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) + scores = paddle.static.data( + name='scores', shape=[81], dtype='float32', lod_level=1) + rois_num = paddle.static.data( + name='rois_num', shape=[40, 41], dtype='int32') + self.assertRaises( + TypeError, + ops.multiclass_nms, + boxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + 
return_index=True, + rois_num=rois_num) + + +class TestMatrixNMS(LayerTest): + def test_matrix_nms(self): + N, M, C = 7, 1200, 21 + BOX_SIZE = 4 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = 0.01 + post_threshold = 0. + + scores_np = np.random.random((N * M, C)).astype('float32') + scores_np = np.apply_along_axis(softmax, 1, scores_np) + scores_np = np.reshape(scores_np, (N, M, C)) + scores_np = np.transpose(scores_np, (0, 2, 1)) + + boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32') + boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5 + boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5 + + with self.static_graph(): + boxes = paddle.static.data( + name='boxes', shape=[N, M, BOX_SIZE], dtype='float32') + scores = paddle.static.data( + name='scores', shape=[N, C, M], dtype='float32') + out, index, _ = ops.matrix_nms( + bboxes=boxes, + scores=scores, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_np, index_np = self.get_static_graph_result( + feed={'boxes': boxes_np, + 'scores': scores_np}, + fetch_list=[out, index], + with_lod=True) + + with self.dynamic_graph(): + boxes_dy = base.to_variable(boxes_np) + scores_dy = base.to_variable(scores_np) + + out_dy, index_dy, _ = ops.matrix_nms( + bboxes=boxes_dy, + scores=scores_dy, + score_threshold=score_threshold, + post_threshold=post_threshold, + nms_top_k=nms_top_k, + keep_top_k=keep_top_k, + return_index=True) + out_dy_np = out_dy.numpy() + index_dy_np = index_dy.numpy() + + self.assertTrue(np.array_equal(out_np, out_dy_np)) + self.assertTrue(np.array_equal(index_np, index_dy_np)) + + def test_matrix_nms_error(self): + with self.static_graph(): + bboxes = paddle.static.data( + name='bboxes', shape=[7, 1200, 4], dtype='float32') + scores = paddle.static.data( + name='data_error', shape=[7, 21, 1200], dtype='int32') + self.assertRaises( + TypeError, + ops.matrix_nms, + bboxes=bboxes, + scores=scores, + score_threshold=0.01, + post_threshold=0., + nms_top_k=400, + keep_top_k=200, + return_index=True) + + +class TestBoxCoder(LayerTest): + def test_box_coder(self): + + prior_box_np = np.random.random((81, 4)).astype('float32') + prior_box_var_np = np.random.random((81, 4)).astype('float32') + target_box_np = np.random.random((20, 81, 4)).astype('float32') + + # static + with self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='float32') + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + boxes = ops.box_coder( + prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False) + + boxes_np, = self.get_static_graph_result( + feed={ + 'prior_box': prior_box_np, + 'prior_box_var': prior_box_var_np, + 'target_box': target_box_np, + }, + fetch_list=[boxes], + with_lod=False) + + # dygraph + with self.dynamic_graph(): + prior_box_dy = base.to_variable(prior_box_np) + prior_box_var_dy = base.to_variable(prior_box_var_np) + target_box_dy = base.to_variable(target_box_np) + + boxes_dy = ops.box_coder( + prior_box=prior_box_dy, + prior_box_var=prior_box_var_dy, + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False) + + boxes_dy_np = boxes_dy.numpy() + + self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) + + def test_box_coder_error(self): + with 
self.static_graph(): + prior_box = paddle.static.data( + name='prior_box', shape=[81, 4], dtype='int32') + prior_box_var = paddle.static.data( + name='prior_box_var', shape=[81, 4], dtype='float32') + target_box = paddle.static.data( + name='target_box', shape=[20, 81, 4], dtype='float32') + + self.assertRaises(TypeError, ops.box_coder, prior_box, + prior_box_var, target_box) + + +class TestGenerateProposals(LayerTest): + def test_generate_proposals(self): + scores_np = np.random.rand(2, 3, 4, 4).astype('float32') + bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') + im_shape_np = np.array([[8, 8], [6, 6]]).astype('float32') + anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), + [4, 4, 3, 4]).astype('float32') + variances_np = np.ones((4, 4, 3, 4)).astype('float32') + + with self.static_graph(): + scores = paddle.static.data( + name='scores', shape=[2, 3, 4, 4], dtype='float32') + bbox_deltas = paddle.static.data( + name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32') + im_shape = paddle.static.data( + name='im_shape', shape=[2, 2], dtype='float32') + anchors = paddle.static.data( + name='anchors', shape=[4, 4, 3, 4], dtype='float32') + variances = paddle.static.data( + name='var', shape=[4, 4, 3, 4], dtype='float32') + rois, roi_probs, rois_num = ops.generate_proposals( + scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result( + feed={ + 'scores': scores_np, + 'bbox_deltas': bbox_deltas_np, + 'im_shape': im_shape_np, + 'anchors': anchors_np, + 'var': variances_np + }, + fetch_list=[rois, roi_probs, rois_num], + with_lod=True) + + with self.dynamic_graph(): + scores_dy = base.to_variable(scores_np) + bbox_deltas_dy = base.to_variable(bbox_deltas_np) + im_shape_dy = base.to_variable(im_shape_np) + anchors_dy = base.to_variable(anchors_np) + variances_dy = base.to_variable(variances_np) + rois, roi_probs, rois_num = ops.generate_proposals( + scores_dy, + bbox_deltas_dy, + im_shape_dy, + anchors_dy, + variances_dy, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_dy = rois.numpy() + roi_probs_dy = roi_probs.numpy() + rois_num_dy = rois_num.numpy() + + self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy)) + self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy)) + self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy)) + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/modeling/tests/test_transfrom.py b/ppdet/modeling/tests/test_transfrom.py new file mode 100644 index 0000000..b987104 --- /dev/null +++ b/ppdet/modeling/tests/test_transfrom.py @@ -0,0 +1,240 @@ +from __future__ import print_function +import random +import unittest +import numpy as np +import copy +# add python path of PadleDetection to sys.path +import os +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.data.transform import * + + +def gen_sample(h, w, nt, nc, random_score=True, channel_first=False): + im = np.random.randint(0, 256, size=(h, w, 3)).astype('float32') + if channel_first: + im = im.transpose((2, 0, 1)) + gt_bbox = np.random.random(size=(nt, 4)).astype('float32') + gt_class = np.random.randint(0, nc, size=(nt, 1)).astype('int32') + if random_score: + gt_score = np.random.random(size=(nt, 1)) + else: + gt_score = np.ones(shape=(nt, 1)).astype('float32') + is_crowd = 
np.zeros_like(gt_class) + sample = { + 'image': im, + 'gt_bbox': gt_bbox, + 'gt_class': gt_class, + 'gt_score': gt_score, + 'is_crowd': is_crowd, + 'h': h, + 'w': w + } + return sample + + +class TestTransformOp(unittest.TestCase): + def setUp(self): + self.h, self.w = np.random.randint(1, 1024, size=2) + self.nt = np.random.randint(1, 50) + self.nc = 80 + + def assertAllClose(self, x, y, msg, atol=1e-5, rtol=1e-3): + self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) + + +class TestResizeOp(TestTransformOp): + def test_resize(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = Resize(target_dim=608, interp=2) + curr_op = ResizeOp(target_size=608, keep_ratio=False, interp=2) + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +# only for specified random seed +# class TestMixupOp(TestTransformOp): +# def setUp(self): +# self.h, self.w = np.random.randint(1024, size=2) +# self.nt = np.random.randint(50) +# self.nc = 80 + +# def test_mixup(self): +# curr_sample = [gen_sample(self.h, self.w, self.nt, self.nc) for _ in range(2)] +# orig_sample = copy.deepcopy(curr_sample[0]) +# orig_sample['mixup'] = copy.deepcopy(curr_sample[1]) +# orig_op = MixupImage(alpha=1.5, beta=1.5) +# curr_op = MixupOp(alpha=1.5, beta=1.5) +# orig_res = orig_op(orig_sample) +# curr_res = curr_op(curr_sample) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for k in fields: +# self.assertAllClose(orig_res[k], curr_res[k], msg=k) + +# only for specified random seed +# class TestRandomDistortOp(TestTransformOp): + +# def test_random_distort(self): +# sample = gen_sample(self.h, self.w, self.nt, self.nc) +# orig_op = ColorDistort(hsv_format=True, random_apply=False) +# curr_op = RandomDistortOp(random_apply=False) +# orig_res = orig_op(copy.deepcopy(sample)) +# curr_res = curr_op(copy.deepcopy(sample)) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for k in fields: +# self.assertAllClose(orig_res[k], curr_res[k], msg=k) + +# only for specified random seed +# class TestRandomExpandOp(TestTransformOp): + +# def test_random_expand(self): +# sample = gen_sample(self.h, self.w, self.nt, self.nc) +# orig_op = RandomExpand(fill_value=[123.675, 116.28, 103.53]) +# curr_op = RandomExpandOp(fill_value=[123.675, 116.28, 103.53]) +# orig_res = orig_op(copy.deepcopy(sample)) +# curr_res = curr_op(copy.deepcopy(sample)) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for k in fields: +# self.assertAllClose(orig_res[k], curr_res[k], msg=k) + +# only for specified random seed +# class TestRandomCropOp(TestTransformOp): + +# def test_random_crop(self): +# sample = gen_sample(self.h, self.w, self.nt, self.nc) +# orig_op = RandomCrop() +# curr_op = RandomCropOp() +# orig_res = orig_op(copy.deepcopy(sample)) +# curr_res = curr_op(copy.deepcopy(sample)) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for k in fields: +# self.assertAllClose(orig_res[k], curr_res[k], msg=k) + +# only for specified random seed +# class TestRandomFlipOp(TestTransformOp): + +# def test_random_flip(self): +# sample = gen_sample(self.h, self.w, self.nt, self.nc) +# orig_op = RandomFlipImage(is_normalized=False) +# curr_op = RandomFlipOp() +# orig_res = orig_op(copy.deepcopy(sample)) +# curr_res = curr_op(copy.deepcopy(sample)) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for k in fields: +# 
self.assertAllClose(orig_res[k], curr_res[k], msg=k) + +# only for specified random seed +# class TestBatchRandomResizeOp(TestTransformOp): + +# def test_batch_random_resize(self): +# sample = [gen_sample(self.h, self.w, self.nt, self.nc) for _ in range(10)] +# orig_op = RandomShape(sizes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_inter=True, resize_box=True) +# curr_op = BatchRandomResizeOp(target_size=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], random_size=True, random_interp=True, keep_ratio=False) +# orig_ress = orig_op(copy.deepcopy(sample)) +# curr_ress = curr_op(copy.deepcopy(sample)) +# fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] +# for orig_res, curr_res in zip(orig_ress, curr_ress): +# for k in fields: +# self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestNormalizeBoxOp(TestTransformOp): + def test_normalize_box(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = NormalizeBox() + curr_op = NormalizeBoxOp() + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestPadBoxOp(TestTransformOp): + def test_pad_box(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = PadBox(num_max_boxes=50) + curr_op = PadBoxOp(num_max_boxes=50) + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestBboxXYXY2XYWHOp(TestTransformOp): + def test_bbox_xyxy2xywh(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = BboxXYXY2XYWH() + curr_op = BboxXYXY2XYWHOp() + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestNormalizeImageOp(TestTransformOp): + def test_normalize_image(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = NormalizeImage( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + is_scale=True, + is_channel_first=False) + curr_op = NormalizeImageOp( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + is_scale=True) + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestPermuteOp(TestTransformOp): + def test_permute(self): + sample = gen_sample(self.h, self.w, self.nt, self.nc) + orig_op = Permute(to_bgr=False, channel_first=True) + curr_op = PermuteOp() + orig_res = orig_op(copy.deepcopy(sample)) + curr_res = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +class TestGt2YoloTargetOp(TestTransformOp): + def test_gt2yolotarget(self): + sample = [ + gen_sample( + self.h, self.w, self.nt, self.nc, channel_first=True) + for _ in range(10) + ] + orig_op = Gt2YoloTarget( + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + downsample_ratios=[32, 16, 8]) + curr_op = Gt2YoloTargetOp( + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 
2]], + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + downsample_ratios=[32, 16, 8]) + orig_ress = orig_op(copy.deepcopy(sample)) + curr_ress = curr_op(copy.deepcopy(sample)) + fields = ['image', 'gt_bbox', 'gt_class', 'gt_score'] + for orig_res, curr_res in zip(orig_ress, curr_ress): + for k in fields: + self.assertAllClose(orig_res[k], curr_res[k], msg=k) + + +if __name__ == "__main__": + unittest.main() diff --git a/ppdet/modeling/tests/test_yolov3_loss.py b/ppdet/modeling/tests/test_yolov3_loss.py new file mode 100644 index 0000000..885ae73 --- /dev/null +++ b/ppdet/modeling/tests/test_yolov3_loss.py @@ -0,0 +1,419 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np +from scipy.special import logit +from scipy.special import expit + +import paddle +from paddle import fluid +from paddle.fluid import core +# add python path of PadleDetection to sys.path +import os +import sys +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.modeling.loss import YOLOv3Loss +from ppdet.data.transform.op_helper import jaccard_overlap +import random +import numpy as np + + +def _split_ioup(output, an_num, num_classes): + """ + Split output feature map to output, predicted iou + along channel dimension + """ + ioup = fluid.layers.slice(output, axes=[1], starts=[0], ends=[an_num]) + ioup = fluid.layers.sigmoid(ioup) + oriout = fluid.layers.slice( + output, axes=[1], starts=[an_num], ends=[an_num * (num_classes + 6)]) + return (ioup, oriout) + + +def _split_output(output, an_num, num_classes): + """ + Split output feature map to x, y, w, h, objectness, classification + along channel dimension + """ + x = fluid.layers.strided_slice( + output, + axes=[1], + starts=[0], + ends=[output.shape[1]], + strides=[5 + num_classes]) + y = fluid.layers.strided_slice( + output, + axes=[1], + starts=[1], + ends=[output.shape[1]], + strides=[5 + num_classes]) + w = fluid.layers.strided_slice( + output, + axes=[1], + starts=[2], + ends=[output.shape[1]], + strides=[5 + num_classes]) + h = fluid.layers.strided_slice( + output, + axes=[1], + starts=[3], + ends=[output.shape[1]], + strides=[5 + num_classes]) + obj = fluid.layers.strided_slice( + output, + axes=[1], + starts=[4], + ends=[output.shape[1]], + strides=[5 + num_classes]) + clss = [] + stride = output.shape[1] // an_num + for m in range(an_num): + clss.append( + fluid.layers.slice( + output, + axes=[1], + starts=[stride * m + 5], + ends=[stride * m + 5 + num_classes])) + cls = fluid.layers.transpose( + fluid.layers.stack( + clss, axis=1), perm=[0, 1, 3, 4, 2]) + return (x, y, w, h, obj, cls) + + +def _split_target(target): + """ + split target to x, y, w, h, objectness, classification + along dimension 2 + target is in shape [N, an_num, 6 + class_num, H, W] + """ + tx = target[:, 
:, 0, :, :] + ty = target[:, :, 1, :, :] + tw = target[:, :, 2, :, :] + th = target[:, :, 3, :, :] + tscale = target[:, :, 4, :, :] + tobj = target[:, :, 5, :, :] + tcls = fluid.layers.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) + tcls.stop_gradient = True + return (tx, ty, tw, th, tscale, tobj, tcls) + + +def _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes, + downsample, ignore_thresh, scale_x_y): + # A prediction bbox overlap any gt_bbox over ignore_thresh, + # objectness loss will be ignored, process as follows: + # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here + # NOTE: img_size is set as 1.0 to get noramlized pred bbox + bbox, prob = fluid.layers.yolo_box( + x=output, + img_size=fluid.layers.ones( + shape=[batch_size, 2], dtype="int32"), + anchors=anchors, + class_num=num_classes, + conf_thresh=0., + downsample_ratio=downsample, + clip_bbox=False, + scale_x_y=scale_x_y) + # 2. split pred bbox and gt bbox by sample, calculate IoU between pred bbox + # and gt bbox in each sample + if batch_size > 1: + preds = fluid.layers.split(bbox, batch_size, dim=0) + gts = fluid.layers.split(gt_box, batch_size, dim=0) + else: + preds = [bbox] + gts = [gt_box] + probs = [prob] + ious = [] + for pred, gt in zip(preds, gts): + + def box_xywh2xyxy(box): + x = box[:, 0] + y = box[:, 1] + w = box[:, 2] + h = box[:, 3] + return fluid.layers.stack( + [ + x - w / 2., + y - h / 2., + x + w / 2., + y + h / 2., + ], axis=1) + + pred = fluid.layers.squeeze(pred, axes=[0]) + gt = box_xywh2xyxy(fluid.layers.squeeze(gt, axes=[0])) + ious.append(fluid.layers.iou_similarity(pred, gt)) + iou = fluid.layers.stack(ious, axis=0) + # 3. Get iou_mask by IoU between gt bbox and prediction bbox, + # Get obj_mask by tobj(holds gt_score), calculate objectness loss + max_iou = fluid.layers.reduce_max(iou, dim=-1) + iou_mask = fluid.layers.cast(max_iou <= ignore_thresh, dtype="float32") + output_shape = fluid.layers.shape(output) + an_num = len(anchors) // 2 + iou_mask = fluid.layers.reshape(iou_mask, (-1, an_num, output_shape[2], + output_shape[3])) + iou_mask.stop_gradient = True + # NOTE: tobj holds gt_score, obj_mask holds object existence mask + obj_mask = fluid.layers.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + # For positive objectness grids, objectness loss should be calculated + # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 + loss_obj = fluid.layers.sigmoid_cross_entropy_with_logits(obj, obj_mask) + loss_obj_pos = fluid.layers.reduce_sum(loss_obj * tobj, dim=[1, 2, 3]) + loss_obj_neg = fluid.layers.reduce_sum( + loss_obj * (1.0 - obj_mask) * iou_mask, dim=[1, 2, 3]) + return loss_obj_pos, loss_obj_neg + + +def fine_grained_loss(output, + target, + gt_box, + batch_size, + num_classes, + anchors, + ignore_thresh, + downsample, + scale_x_y=1., + eps=1e-10): + an_num = len(anchors) // 2 + x, y, w, h, obj, cls = _split_output(output, an_num, num_classes) + tx, ty, tw, th, tscale, tobj, tcls = _split_target(target) + + tscale_tobj = tscale * tobj + + scale_x_y = scale_x_y + + if (abs(scale_x_y - 1.0) < eps): + loss_x = fluid.layers.sigmoid_cross_entropy_with_logits( + x, tx) * tscale_tobj + loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3]) + loss_y = fluid.layers.sigmoid_cross_entropy_with_logits( + y, ty) * tscale_tobj + loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3]) + else: + dx = scale_x_y * fluid.layers.sigmoid(x) - 0.5 * (scale_x_y - 1.0) + dy = scale_x_y * fluid.layers.sigmoid(y) 
- 0.5 * (scale_x_y - 1.0) + loss_x = fluid.layers.abs(dx - tx) * tscale_tobj + loss_x = fluid.layers.reduce_sum(loss_x, dim=[1, 2, 3]) + loss_y = fluid.layers.abs(dy - ty) * tscale_tobj + loss_y = fluid.layers.reduce_sum(loss_y, dim=[1, 2, 3]) + + # NOTE: we refined loss function of (w, h) as L1Loss + loss_w = fluid.layers.abs(w - tw) * tscale_tobj + loss_w = fluid.layers.reduce_sum(loss_w, dim=[1, 2, 3]) + loss_h = fluid.layers.abs(h - th) * tscale_tobj + loss_h = fluid.layers.reduce_sum(loss_h, dim=[1, 2, 3]) + + loss_obj_pos, loss_obj_neg = _calc_obj_loss( + output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, + ignore_thresh, scale_x_y) + + loss_cls = fluid.layers.sigmoid_cross_entropy_with_logits(cls, tcls) + loss_cls = fluid.layers.elementwise_mul(loss_cls, tobj, axis=0) + loss_cls = fluid.layers.reduce_sum(loss_cls, dim=[1, 2, 3, 4]) + + loss_xys = fluid.layers.reduce_mean(loss_x + loss_y) + loss_whs = fluid.layers.reduce_mean(loss_w + loss_h) + loss_objs = fluid.layers.reduce_mean(loss_obj_pos + loss_obj_neg) + loss_clss = fluid.layers.reduce_mean(loss_cls) + + losses_all = { + "loss_xy": fluid.layers.sum(loss_xys), + "loss_wh": fluid.layers.sum(loss_whs), + "loss_loc": fluid.layers.sum(loss_xys) + fluid.layers.sum(loss_whs), + "loss_obj": fluid.layers.sum(loss_objs), + "loss_cls": fluid.layers.sum(loss_clss), + } + return losses_all, x, y, tx, ty + + +def gt2yolotarget(gt_bbox, gt_class, gt_score, anchors, mask, num_classes, size, + stride): + grid_h, grid_w = size + h, w = grid_h * stride, grid_w * stride + an_hw = np.array(anchors) / np.array([[w, h]]) + target = np.zeros( + (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32) + for b in range(gt_bbox.shape[0]): + gx, gy, gw, gh = gt_bbox[b, :] + cls = gt_class[b] + score = gt_score[b] + if gw <= 0. or gh <= 0. or score <= 0.: + continue + + # find best match anchor index + best_iou = 0. + best_idx = -1 + for an_idx in range(an_hw.shape[0]): + iou = jaccard_overlap([0., 0., gw, gh], + [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) + if iou > best_iou: + best_iou = iou + best_idx = an_idx + + gi = int(gx * grid_w) + gj = int(gy * grid_h) + + # gtbox should be regresed in this layes if best match + # anchor index in anchor mask of this layer + if best_idx in mask: + best_n = mask.index(best_idx) + + # x, y, w, h, scale + target[best_n, 0, gj, gi] = gx * grid_w - gi + target[best_n, 1, gj, gi] = gy * grid_h - gj + target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0]) + target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1]) + target[best_n, 4, gj, gi] = 2.0 - gw * gh + + # objectness record gt_score + # if target[best_n, 5, gj, gi] > 0: + # print('find 1 duplicate') + target[best_n, 5, gj, gi] = score + + # classification + target[best_n, 6 + cls, gj, gi] = 1. 
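            # NOTE: each matched cell of target[best_n] now holds the channels
            # [tx, ty, tw, th, loc_scale, objectness (gt_score), one-hot class
            # scores], i.e. the (6 + num_classes) layout that _split_target()
            # unpacks inside fine_grained_loss above.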
+ + return target + + +class TestYolov3LossOp(unittest.TestCase): + def setUp(self): + self.initTestCase() + x = np.random.uniform(0, 1, self.x_shape).astype('float64') + gtbox = np.random.random(size=self.gtbox_shape).astype('float64') + gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) + gtbox = gtbox * gtmask[:, :, np.newaxis] + gtlabel = gtlabel * gtmask + + gtscore = np.ones(self.gtbox_shape[:2]).astype('float64') + if self.gtscore: + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64') + + target = [] + for box, label, score in zip(gtbox, gtlabel, gtscore): + target.append( + gt2yolotarget(box, label, score, self.anchors, self.anchor_mask, + self.class_num, (self.h, self.w + ), self.downsample_ratio)) + + self.target = np.array(target).astype('float64') + + self.mask_anchors = [] + for i in self.anchor_mask: + self.mask_anchors.extend(self.anchors[i]) + self.x = x + self.gtbox = gtbox + self.gtlabel = gtlabel + self.gtscore = gtscore + + def initTestCase(self): + self.b = 8 + self.h = 19 + self.w = 19 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [6, 7, 8] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1. + + def test_loss(self): + x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target + yolo_loss = YOLOv3Loss( + ignore_thresh=self.ignore_thresh, + label_smooth=self.use_label_smooth, + num_classes=self.class_num, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + x = paddle.to_tensor(x.astype(np.float32)) + gtbox = paddle.to_tensor(gtbox.astype(np.float32)) + gtlabel = paddle.to_tensor(gtlabel.astype(np.float32)) + gtscore = paddle.to_tensor(gtscore.astype(np.float32)) + t = paddle.to_tensor(target.astype(np.float32)) + anchor = [self.anchors[i] for i in self.anchor_mask] + (yolo_loss1, px, py, tx, ty) = fine_grained_loss( + output=x, + target=t, + gt_box=gtbox, + batch_size=self.b, + num_classes=self.class_num, + anchors=self.mask_anchors, + ignore_thresh=self.ignore_thresh, + downsample=self.downsample_ratio, + scale_x_y=self.scale_x_y) + yolo_loss2 = yolo_loss.yolov3_loss( + x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y) + for k in yolo_loss2: + self.assertAlmostEqual( + yolo_loss1[k].numpy()[0], + yolo_loss2[k].numpy()[0], + delta=1e-2, + msg=k) + + +class TestYolov3LossNoGTScore(TestYolov3LossOp): + def initTestCase(self): + self.b = 1 + self.h = 76 + self.w = 76 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [0, 1, 2] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 8 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = False + self.use_label_smooth = False + self.scale_x_y = 1. 
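# A minimal illustrative sketch (values assumed, chosen to mirror
# TestYolov3LossOp: 19x19 grid, stride 32, anchor (116, 90)) of the per-cell
# regression targets that gt2yolotarget above encodes for one normalized box.
import numpy as np

gx, gy, gw, gh = 0.52, 0.31, 0.40, 0.25      # normalized (cx, cy, w, h)
grid_w = grid_h = 19
stride = 32
w = h = grid_w * stride                      # implied input resolution (608)
anchor_w, anchor_h = 116, 90

gi, gj = int(gx * grid_w), int(gy * grid_h)  # responsible grid cell
tx, ty = gx * grid_w - gi, gy * grid_h - gj  # offsets inside that cell
tw = np.log(gw * w / anchor_w)               # log-space width target
th = np.log(gh * h / anchor_h)               # log-space height target
tscale = 2.0 - gw * gh                       # loss weight, larger for small boxes

print(gi, gj, round(tx, 2), round(ty, 2), round(tw, 3), round(th, 3), tscale)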
+ + +class TestYolov3LossWithScaleXY(TestYolov3LossOp): + def initTestCase(self): + self.b = 5 + self.h = 38 + self.w = 38 + self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]] + self.anchor_mask = [3, 4, 5] + self.na = len(self.anchor_mask) + self.class_num = 80 + self.ignore_thresh = 0.7 + self.downsample_ratio = 16 + self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), + self.h, self.w) + self.gtbox_shape = (self.b, 40, 4) + self.gtscore = True + self.use_label_smooth = False + self.scale_x_y = 1.2 + + +if __name__ == "__main__": + unittest.main() diff --git a/ppdet/optimizer.py b/ppdet/optimizer.py new file mode 100644 index 0000000..5334eba --- /dev/null +++ b/ppdet/optimizer.py @@ -0,0 +1,259 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import copy +import paddle +import paddle.nn as nn + +import paddle.optimizer as optimizer +from paddle.optimizer.lr import CosineAnnealingDecay +import paddle.regularizer as regularizer +from paddle import cos + +from ppdet.core.workspace import register, serializable + +__all__ = ['LearningRate', 'OptimizerBuilder'] + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@serializable +class CosineDecay(object): + """ + Cosine learning rate decay + + Args: + max_epochs (int): max epochs for the training process. 
+ if you commbine cosine decay with warmup, it is recommended that + the max_iters is much larger than the warmup iter + """ + + def __init__(self, max_epochs=1000, use_warmup=True): + self.max_epochs = max_epochs + self.use_warmup = use_warmup + + def __call__(self, + base_lr=None, + boundary=None, + value=None, + step_per_epoch=None): + assert base_lr is not None, "either base LR or values should be provided" + + max_iters = self.max_epochs * int(step_per_epoch) + + if boundary is not None and value is not None and self.use_warmup: + for i in range(int(boundary[-1]), max_iters): + boundary.append(i) + + decayed_lr = base_lr * 0.5 * ( + math.cos(i * math.pi / max_iters) + 1) + value.append(decayed_lr) + return optimizer.lr.PiecewiseDecay(boundary, value) + + return optimizer.lr.CosineAnnealingDecay(base_lr, T_max=max_iters) + + +@serializable +class PiecewiseDecay(object): + """ + Multi step learning rate decay + + Args: + gamma (float | list): decay factor + milestones (list): steps at which to decay learning rate + """ + + def __init__(self, + gamma=[0.1, 0.01], + milestones=[8, 11], + values=None, + use_warmup=True): + super(PiecewiseDecay, self).__init__() + if type(gamma) is not list: + self.gamma = [] + for i in range(len(milestones)): + self.gamma.append(gamma / 10**i) + else: + self.gamma = gamma + self.milestones = milestones + self.values = values + self.use_warmup = use_warmup + + def __call__(self, + base_lr=None, + boundary=None, + value=None, + step_per_epoch=None): + if boundary is not None and self.use_warmup: + boundary.extend([int(step_per_epoch) * i for i in self.milestones]) + else: + # do not use LinearWarmup + boundary = [int(step_per_epoch) * i for i in self.milestones] + + # self.values is setted directly in config + if self.values is not None: + assert len(self.milestones) + 1 == len(self.values) + return optimizer.lr.PiecewiseDecay(boundary, self.values) + + # value is computed by self.gamma + if value is not None: + for i in self.gamma: + value.append(base_lr * i) + + return optimizer.lr.PiecewiseDecay(boundary, value) + + +@serializable +class LinearWarmup(object): + """ + Warm up learning rate linearly + + Args: + steps (int): warm up steps + start_factor (float): initial learning rate factor + """ + + def __init__(self, steps=500, start_factor=1. 
/ 3): + super(LinearWarmup, self).__init__() + self.steps = steps + self.start_factor = start_factor + + def __call__(self, base_lr): + boundary = [] + value = [] + for i in range(self.steps + 1): + alpha = i / self.steps + factor = self.start_factor * (1 - alpha) + alpha + lr = base_lr * factor + value.append(lr) + if i > 0: + boundary.append(i) + return boundary, value + + +@register +class LearningRate(object): + """ + Learning Rate configuration + + Args: + base_lr (float): base learning rate + schedulers (list): learning rate schedulers + """ + __category__ = 'optim' + + def __init__(self, + base_lr=0.01, + schedulers=[PiecewiseDecay(), LinearWarmup()]): + super(LearningRate, self).__init__() + self.base_lr = base_lr + self.schedulers = schedulers + + def __call__(self, step_per_epoch): + assert len(self.schedulers) >= 1 + if not self.schedulers[0].use_warmup: + return self.schedulers[0](base_lr=self.base_lr, + step_per_epoch=step_per_epoch) + + # TODO: split warmup & decay + # warmup + boundary, value = self.schedulers[1](self.base_lr) + # decay + decay_lr = self.schedulers[0](self.base_lr, boundary, value, + step_per_epoch) + return decay_lr + + +@register +class OptimizerBuilder(): + """ + Build optimizer handles + Args: + regularizer (object): an `Regularizer` instance + optimizer (object): an `Optimizer` instance + """ + __category__ = 'optim' + + def __init__(self, + clip_grad_by_norm=None, + regularizer={'type': 'L2', + 'factor': .0001}, + optimizer={'type': 'Momentum', + 'momentum': .9}): + self.clip_grad_by_norm = clip_grad_by_norm + self.regularizer = regularizer + self.optimizer = optimizer + + def __call__(self, learning_rate, params=None): + if self.clip_grad_by_norm is not None: + grad_clip = nn.ClipGradByGlobalNorm( + clip_norm=self.clip_grad_by_norm) + else: + grad_clip = None + + if self.regularizer: + reg_type = self.regularizer['type'] + 'Decay' + reg_factor = self.regularizer['factor'] + regularization = getattr(regularizer, reg_type)(reg_factor) + else: + regularization = None + + optim_args = self.optimizer.copy() + optim_type = optim_args['type'] + del optim_args['type'] + op = getattr(optimizer, optim_type) + return op(learning_rate=learning_rate, + parameters=params, + weight_decay=regularization, + grad_clip=grad_clip, + **optim_args) + + +class ModelEMA(object): + def __init__(self, decay, model, use_thres_step=False): + self.step = 0 + self.decay = decay + self.state_dict = dict() + for k, v in model.state_dict().items(): + self.state_dict[k] = paddle.zeros_like(v) + self.use_thres_step = use_thres_step + + def update(self, model): + if self.use_thres_step: + decay = min(self.decay, (1 + self.step) / (10 + self.step)) + else: + decay = self.decay + self._decay = decay + model_dict = model.state_dict() + for k, v in self.state_dict.items(): + v = decay * v + (1 - decay) * model_dict[k] + v.stop_gradient = True + self.state_dict[k] = v + self.step += 1 + + def apply(self): + if self.step == 0: + return self.state_dict + state_dict = dict() + for k, v in self.state_dict.items(): + v = v / (1 - self._decay**self.step) + v.stop_gradient = True + state_dict[k] = v + return state_dict diff --git a/ppdet/slim/__init__.py b/ppdet/slim/__init__.py new file mode 100644 index 0000000..ab28664 --- /dev/null +++ b/ppdet/slim/__init__.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import prune +from . import quant +from . import distill + +from .prune import * +from .quant import * +from .distill import * + +import yaml +from ppdet.core.workspace import load_config +from ppdet.utils.checkpoint import load_pretrain_weight + + +def build_slim_model(cfg, slim_cfg, mode='train'): + with open(slim_cfg) as f: + slim_load_cfg = yaml.load(f, Loader=yaml.Loader) + if mode != 'train' and slim_load_cfg['slim'] == 'Distill': + return cfg + + if slim_load_cfg['slim'] == 'Distill': + model = DistillModel(cfg, slim_cfg) + cfg['model'] = model + elif slim_load_cfg['slim'] == 'DistillPrune': + if mode == 'train': + model = DistillModel(cfg, slim_cfg) + pruner = create(cfg.pruner) + pruner(model.student_model) + else: + model = create(cfg.architecture) + weights = cfg.weights + load_config(slim_cfg) + pruner = create(cfg.pruner) + model = pruner(model) + load_pretrain_weight(model, weights) + cfg['model'] = model + else: + load_config(slim_cfg) + model = create(cfg.architecture) + if mode == 'train': + load_pretrain_weight(model, cfg.pretrain_weights) + slim = create(cfg.slim) + cfg['model'] = slim(model) + cfg['slim'] = slim + if mode != 'train': + load_pretrain_weight(cfg['model'], cfg.weights) + + return cfg diff --git a/ppdet/slim/__pycache__/__init__.cpython-38.pyc b/ppdet/slim/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..bb67e7b Binary files /dev/null and b/ppdet/slim/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/slim/__pycache__/__init__.cpython-39.pyc b/ppdet/slim/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..abe4a9a Binary files /dev/null and b/ppdet/slim/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/slim/__pycache__/distill.cpython-38.pyc b/ppdet/slim/__pycache__/distill.cpython-38.pyc new file mode 100644 index 0000000..c228fc9 Binary files /dev/null and b/ppdet/slim/__pycache__/distill.cpython-38.pyc differ diff --git a/ppdet/slim/__pycache__/distill.cpython-39.pyc b/ppdet/slim/__pycache__/distill.cpython-39.pyc new file mode 100644 index 0000000..53c1160 Binary files /dev/null and b/ppdet/slim/__pycache__/distill.cpython-39.pyc differ diff --git a/ppdet/slim/__pycache__/prune.cpython-38.pyc b/ppdet/slim/__pycache__/prune.cpython-38.pyc new file mode 100644 index 0000000..04a4841 Binary files /dev/null and b/ppdet/slim/__pycache__/prune.cpython-38.pyc differ diff --git a/ppdet/slim/__pycache__/prune.cpython-39.pyc b/ppdet/slim/__pycache__/prune.cpython-39.pyc new file mode 100644 index 0000000..1d83008 Binary files /dev/null and b/ppdet/slim/__pycache__/prune.cpython-39.pyc differ diff --git a/ppdet/slim/__pycache__/quant.cpython-38.pyc b/ppdet/slim/__pycache__/quant.cpython-38.pyc new file mode 100644 index 0000000..ceee108 Binary files /dev/null and b/ppdet/slim/__pycache__/quant.cpython-38.pyc differ diff --git a/ppdet/slim/__pycache__/quant.cpython-39.pyc b/ppdet/slim/__pycache__/quant.cpython-39.pyc new file mode 100644 index 0000000..97360bf Binary files /dev/null and b/ppdet/slim/__pycache__/quant.cpython-39.pyc differ diff --git a/ppdet/slim/distill.py 
b/ppdet/slim/distill.py new file mode 100644 index 0000000..d5c9d72 --- /dev/null +++ b/ppdet/slim/distill.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register, serializable, load_config +from ppdet.core.workspace import create +from ppdet.utils.logger import setup_logger +from ppdet.modeling import ops +from ppdet.utils.checkpoint import load_pretrain_weight +from ppdet.modeling.losses import YOLOv3Loss +logger = setup_logger(__name__) + + +class DistillModel(nn.Layer): + def __init__(self, cfg, slim_cfg): + super(DistillModel, self).__init__() + + self.student_model = create(cfg.architecture) + logger.debug('Load student model pretrain_weights:{}'.format( + cfg.pretrain_weights)) + load_pretrain_weight(self.student_model, cfg.pretrain_weights) + + slim_cfg = load_config(slim_cfg) + self.teacher_model = create(slim_cfg.architecture) + self.distill_loss = create(slim_cfg.distill_loss) + logger.debug('Load teacher model pretrain_weights:{}'.format( + slim_cfg.pretrain_weights)) + load_pretrain_weight(self.teacher_model, slim_cfg.pretrain_weights) + + for param in self.teacher_model.parameters(): + param.trainable = False + + def parameters(self): + return self.student_model.parameters() + + def forward(self, inputs): + if self.training: + teacher_loss = self.teacher_model(inputs) + student_loss = self.student_model(inputs) + loss = self.distill_loss(self.teacher_model, self.student_model) + student_loss['distill_loss'] = loss + student_loss['teacher_loss'] = teacher_loss['loss'] + student_loss['loss'] += student_loss['distill_loss'] + return student_loss + else: + return self.student_model(inputs) + + +@register +class DistillYOLOv3Loss(nn.Layer): + def __init__(self, weight=1000): + super(DistillYOLOv3Loss, self).__init__() + self.weight = weight + + def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): + loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) + loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) + loss_w = paddle.abs(sw - tw) + loss_h = paddle.abs(sh - th) + loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) + weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) + return weighted_loss + + def obj_weighted_cls(self, scls, tcls, tobj): + loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) + weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) + return weighted_loss + + def obj_loss(self, sobj, tobj): + obj_mask = paddle.cast(tobj > 0., dtype="float32") + obj_mask.stop_gradient = True + loss = paddle.mean( + ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) + return loss + + def forward(self, teacher_model, student_model): + teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs + 
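        # The distill pairs collected by the YOLOv3 loss are consumed below as
        # per-level (x, y, w, h, objectness, class logits) tuples: indices 0-3
        # feed obj_weighted_reg, index 4 drives obj_loss and the soft
        # objectness weighting, and index 5 feeds obj_weighted_cls.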
student_distill_pairs = student_model.yolo_head.loss.distill_pairs + distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] + for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): + distill_reg_loss.append( + self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ + 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) + distill_cls_loss.append( + self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) + distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) + distill_reg_loss = paddle.add_n(distill_reg_loss) + distill_cls_loss = paddle.add_n(distill_cls_loss) + distill_obj_loss = paddle.add_n(distill_obj_loss) + loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss + ) * self.weight + return loss diff --git a/ppdet/slim/prune.py b/ppdet/slim/prune.py new file mode 100644 index 0000000..2d01e30 --- /dev/null +++ b/ppdet/slim/prune.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle.utils import try_import + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +def print_prune_params(model): + model_dict = model.state_dict() + for key in model_dict.keys(): + weight_name = model_dict[key].name + logger.info('Parameter name: {}, shape: {}'.format( + weight_name, model_dict[key].shape)) + + +@register +@serializable +class Pruner(object): + def __init__(self, + criterion, + pruned_params, + pruned_ratios, + print_params=False): + super(Pruner, self).__init__() + assert criterion in ['l1_norm', 'fpgm'], \ + "unsupported prune criterion: {}".format(criterion) + self.criterion = criterion + self.pruned_params = pruned_params + self.pruned_ratios = pruned_ratios + self.print_params = print_params + + def __call__(self, model): + # FIXME: adapt to network graph when Training and inference are + # inconsistent, now only supports prune inference network graph. 
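        # The dummy input_spec below (a 1x3x640x640 image plus matching
        # im_shape and scale_factor entries) only gives paddleslim a concrete
        # input signature for tracing the inference graph, both for the FLOPs
        # statistics and for the filter pruners created further down.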
+ model.eval() + paddleslim = try_import('paddleslim') + from paddleslim.analysis import dygraph_flops as flops + input_spec = [{ + "image": paddle.ones( + shape=[1, 3, 640, 640], dtype='float32'), + "im_shape": paddle.full( + [1, 2], 640, dtype='float32'), + "scale_factor": paddle.ones( + shape=[1, 2], dtype='float32') + }] + if self.print_params: + print_prune_params(model) + + ori_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) + if self.criterion == 'fpgm': + pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) + elif self.criterion == 'l1_norm': + pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) + + logger.info("pruned params: {}".format(self.pruned_params)) + pruned_ratios = [float(n) for n in self.pruned_ratios] + ratios = {} + for i, param in enumerate(self.pruned_params): + ratios[param] = pruned_ratios[i] + pruner.prune_vars(ratios, [0]) + pruned_flops = flops(model, input_spec) / 1000 + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + pruned_flops, (ori_flops - pruned_flops) / ori_flops)) + + return model diff --git a/ppdet/slim/quant.py b/ppdet/slim/quant.py new file mode 100644 index 0000000..a1fe126 --- /dev/null +++ b/ppdet/slim/quant.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle.utils import try_import + +from ppdet.core.workspace import register, serializable +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class QAT(object): + def __init__(self, quant_config, print_model): + super(QAT, self).__init__() + self.quant_config = quant_config + self.print_model = print_model + + def __call__(self, model): + paddleslim = try_import('paddleslim') + self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) + if self.print_model: + logger.info("Model before quant:") + logger.info(model) + + self.quanter.quantize(model) + + if self.print_model: + logger.info("Quantized model:") + logger.info(model) + + return model + + def save_quantized_model(self, layer, path, input_spec=None, **config): + self.quanter.save_quantized_model( + model=layer, path=path, input_spec=input_spec, **config) diff --git a/ppdet/utils/__init__.py b/ppdet/utils/__init__.py new file mode 100644 index 0000000..d0c32e2 --- /dev/null +++ b/ppdet/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/utils/__pycache__/__init__.cpython-38.pyc b/ppdet/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..37da40e Binary files /dev/null and b/ppdet/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/__init__.cpython-39.pyc b/ppdet/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..14afafc Binary files /dev/null and b/ppdet/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/check.cpython-39.pyc b/ppdet/utils/__pycache__/check.cpython-39.pyc new file mode 100644 index 0000000..16881dd Binary files /dev/null and b/ppdet/utils/__pycache__/check.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/checkpoint.cpython-38.pyc b/ppdet/utils/__pycache__/checkpoint.cpython-38.pyc new file mode 100644 index 0000000..d418ce6 Binary files /dev/null and b/ppdet/utils/__pycache__/checkpoint.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/checkpoint.cpython-39.pyc b/ppdet/utils/__pycache__/checkpoint.cpython-39.pyc new file mode 100644 index 0000000..3372443 Binary files /dev/null and b/ppdet/utils/__pycache__/checkpoint.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/cli.cpython-39.pyc b/ppdet/utils/__pycache__/cli.cpython-39.pyc new file mode 100644 index 0000000..a8e4de8 Binary files /dev/null and b/ppdet/utils/__pycache__/cli.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/colormap.cpython-38.pyc b/ppdet/utils/__pycache__/colormap.cpython-38.pyc new file mode 100644 index 0000000..462b26d Binary files /dev/null and b/ppdet/utils/__pycache__/colormap.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/colormap.cpython-39.pyc b/ppdet/utils/__pycache__/colormap.cpython-39.pyc new file mode 100644 index 0000000..9d7e8d5 Binary files /dev/null and b/ppdet/utils/__pycache__/colormap.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/download.cpython-38.pyc b/ppdet/utils/__pycache__/download.cpython-38.pyc new file mode 100644 index 0000000..e219b8a Binary files /dev/null and b/ppdet/utils/__pycache__/download.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/download.cpython-39.pyc b/ppdet/utils/__pycache__/download.cpython-39.pyc new file mode 100644 index 0000000..37d0b3f Binary files /dev/null and b/ppdet/utils/__pycache__/download.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/logger.cpython-38.pyc b/ppdet/utils/__pycache__/logger.cpython-38.pyc new file mode 100644 index 0000000..301b21c Binary files /dev/null and b/ppdet/utils/__pycache__/logger.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/logger.cpython-39.pyc b/ppdet/utils/__pycache__/logger.cpython-39.pyc new file mode 100644 index 0000000..417ddbe Binary files /dev/null and b/ppdet/utils/__pycache__/logger.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/stats.cpython-38.pyc b/ppdet/utils/__pycache__/stats.cpython-38.pyc new file mode 100644 index 0000000..8476fa0 Binary files /dev/null and b/ppdet/utils/__pycache__/stats.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/stats.cpython-39.pyc 
b/ppdet/utils/__pycache__/stats.cpython-39.pyc new file mode 100644 index 0000000..784f472 Binary files /dev/null and b/ppdet/utils/__pycache__/stats.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/visualizer.cpython-38.pyc b/ppdet/utils/__pycache__/visualizer.cpython-38.pyc new file mode 100644 index 0000000..64af338 Binary files /dev/null and b/ppdet/utils/__pycache__/visualizer.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/visualizer.cpython-39.pyc b/ppdet/utils/__pycache__/visualizer.cpython-39.pyc new file mode 100644 index 0000000..e5ef7c8 Binary files /dev/null and b/ppdet/utils/__pycache__/visualizer.cpython-39.pyc differ diff --git a/ppdet/utils/__pycache__/voc_utils.cpython-38.pyc b/ppdet/utils/__pycache__/voc_utils.cpython-38.pyc new file mode 100644 index 0000000..d2ff806 Binary files /dev/null and b/ppdet/utils/__pycache__/voc_utils.cpython-38.pyc differ diff --git a/ppdet/utils/__pycache__/voc_utils.cpython-39.pyc b/ppdet/utils/__pycache__/voc_utils.cpython-39.pyc new file mode 100644 index 0000000..a115831 Binary files /dev/null and b/ppdet/utils/__pycache__/voc_utils.cpython-39.pyc differ diff --git a/ppdet/utils/check.py b/ppdet/utils/check.py new file mode 100644 index 0000000..3a3bcf7 --- /dev/null +++ b/ppdet/utils/check.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +import six +import paddle.version as fluid_version + +from .logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['check_gpu', 'check_version', 'check_config'] + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not paddle.is_compiled_with_cuda(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(version='2.0'): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version {} or higher is required, " \ + "or a suitable develop version is satisfied as well. 
\n" \ + "Please make sure the version is good with your code.".format(version) + + version_installed = [ + fluid_version.major, fluid_version.minor, fluid_version.patch, + fluid_version.rc + ] + if version_installed == ['0', '0', '0', '0']: + return + version_split = version.split('.') + + length = min(len(version_installed), len(version_split)) + for i in six.moves.range(length): + if version_installed[i] > version_split[i]: + return + if version_installed[i] < version_split[i]: + raise Exception(err) + + +def check_config(cfg): + """ + Check the correctness of the configuration file. Log error and exit + when Config is not compliant. + """ + err = "'{}' not specified in config file. Please set it in config file." + check_list = ['architecture', 'num_classes'] + try: + for var in check_list: + if not var in cfg: + logger.error(err.format(var)) + sys.exit(1) + except Exception as e: + pass + + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + return cfg diff --git a/ppdet/utils/checkpoint.py b/ppdet/utils/checkpoint.py new file mode 100644 index 0000000..d4f0809 --- /dev/null +++ b/ppdet/utils/checkpoint.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import errno +import os +import time +import re +import numpy as np +import paddle +import paddle.nn as nn +from .download import get_weights_path + +from .logger import setup_logger +logger = setup_logger(__name__) + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. 
+ """ + return path.startswith('http://') \ + or path.startswith('https://') \ + or path.startswith('ppdet://') + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_weights_path_dist(path): + env = os.environ + if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: + trainer_id = int(env['PADDLE_TRAINER_ID']) + num_trainers = int(env['PADDLE_TRAINERS_NUM']) + if num_trainers <= 1: + path = get_weights_path(path) + else: + from ppdet.utils.download import map_path, WEIGHTS_HOME + weight_path = map_path(path, WEIGHTS_HOME) + lock_path = weight_path + '.lock' + if not os.path.exists(weight_path): + from paddle.distributed import ParallelEnv + unique_endpoints = _get_unique_endpoints(ParallelEnv() + .trainer_endpoints[:]) + try: + os.makedirs(os.path.dirname(weight_path)) + except OSError as e: + if e.errno != errno.EEXIST: + raise + with open(lock_path, 'w'): # touch + os.utime(lock_path, None) + if ParallelEnv().current_endpoint in unique_endpoints: + get_weights_path(path) + os.remove(lock_path) + else: + while os.path.exists(lock_path): + time.sleep(1) + path = weight_path + else: + path = get_weights_path(path) + + return path + + +def _strip_postfix(path): + path, ext = os.path.splitext(path) + assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ + "Unknown postfix {} from weights".format(ext) + return path + + +def load_weight(model, weight, optimizer=None): + if is_url(weight): + weight = get_weights_path_dist(weight) + + path = _strip_postfix(weight) + pdparam_path = path + '.pdparams' + if not os.path.exists(pdparam_path): + raise ValueError("Model pretrain path {} does not " + "exists.".format(pdparam_path)) + + param_state_dict = paddle.load(pdparam_path) + model_dict = model.state_dict() + model_weight = {} + incorrect_keys = 0 + + for key in model_dict.keys(): + if key in param_state_dict.keys(): + model_weight[key] = param_state_dict[key] + else: + logger.info('Unmatched key: {}'.format(key)) + incorrect_keys += 1 + + assert incorrect_keys == 0, "Load weight {} incorrectly, \ + {} keys unmatched, please check again.".format(weight, + incorrect_keys) + logger.info('Finish resuming model weights: {}'.format(pdparam_path)) + + model.set_dict(model_weight) + + last_epoch = 0 + if optimizer is not None and os.path.exists(path + '.pdopt'): + optim_state_dict = paddle.load(path + '.pdopt') + # to solve resume bug, will it be fixed in paddle 2.0 + for key in optimizer.state_dict().keys(): + if not key in optim_state_dict.keys(): + optim_state_dict[key] = optimizer.state_dict()[key] + if 'last_epoch' in optim_state_dict: + last_epoch = optim_state_dict.pop('last_epoch') + optimizer.set_state_dict(optim_state_dict) + + return last_epoch + + +def load_pretrain_weight(model, pretrain_weight): + if is_url(pretrain_weight): + pretrain_weight = get_weights_path_dist(pretrain_weight) + + path = _strip_postfix(pretrain_weight) + if not (os.path.isdir(path) or os.path.isfile(path) or + os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path `{}` does not exists. 
" + "If you don't want to load pretrain model, " + "please delete `pretrain_weights` field in " + "config file.".format(path)) + + model_dict = model.state_dict() + + weights_path = path + '.pdparams' + param_state_dict = paddle.load(weights_path) + ignore_weights = set() + + for name, weight in param_state_dict.items(): + if name in model_dict.keys(): + if list(weight.shape) != list(model_dict[name].shape): + logger.info( + '{} not used, shape {} unmatched with {} in model.'.format( + name, weight.shape, list(model_dict[name].shape))) + ignore_weights.add(name) + else: + logger.info('Redundant weight {} and ignore it.'.format(name)) + ignore_weights.add(name) + + for weight in ignore_weights: + param_state_dict.pop(weight, None) + + model.set_dict(param_state_dict) + logger.info('Finish loading model weights: {}'.format(weights_path)) + + +def save_model(model, optimizer, save_dir, save_name, last_epoch): + """ + save model into disk. + + Args: + model (paddle.nn.Layer): the Layer instalce to save parameters. + optimizer (paddle.optimizer.Optimizer): the Optimizer instance to + save optimizer states. + save_dir (str): the directory to be saved. + save_name (str): the path to be saved. + last_epoch (int): the epoch index. + """ + if paddle.distributed.get_rank() != 0: + return + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = os.path.join(save_dir, save_name) + if isinstance(model, nn.Layer): + paddle.save(model.state_dict(), save_path + ".pdparams") + else: + assert isinstance(model, + dict), 'model is not a instance of nn.layer or dict' + paddle.save(model, save_path + ".pdparams") + state_dict = optimizer.state_dict() + state_dict['last_epoch'] = last_epoch + paddle.save(state_dict, save_path + ".pdopt") + logger.info("Save checkpoint: {}".format(save_dir)) diff --git a/ppdet/utils/cli.py b/ppdet/utils/cli.py new file mode 100644 index 0000000..b8ba59d --- /dev/null +++ b/ppdet/utils/cli.py @@ -0,0 +1,151 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +import yaml +import re +from ppdet.core.workspace import get_registered_modules, dump_value + +__all__ = ['ColorTTY', 'ArgsParser'] + + +class ColorTTY(object): + def __init__(self): + super(ColorTTY, self).__init__() + self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'] + + def __getattr__(self, attr): + if attr in self.colors: + color = self.colors.index(attr) + 31 + + def color_message(message): + return "[{}m{}".format(color, message) + + setattr(self, attr, color_message) + return color_message + + def bold(self, message): + return self.with_code('01', message) + + def with_code(self, code, message): + return "[{}m{}".format(code, message) + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument("-c", "--config", help="configuration file to use") + self.add_argument( + "-o", "--opt", nargs='*', help="set configuration options") + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config is not None, \ + "Please specify --config=configure_file_path." + args.opt = self._parse_opt(args.opt) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=', 1) + if '.' not in k: + config[k] = yaml.load(v, Loader=yaml.Loader) + else: + keys = k.split('.') + if keys[0] not in config: + config[keys[0]] = {} + cur = config[keys[0]] + for idx, key in enumerate(keys[1:]): + if idx == len(keys) - 2: + cur[key] = yaml.load(v, Loader=yaml.Loader) + else: + cur[key] = {} + cur = cur[key] + return config + + +def print_total_cfg(config): + modules = get_registered_modules() + color_tty = ColorTTY() + green = '___{}___'.format(color_tty.colors.index('green') + 31) + + styled = {} + for key in config.keys(): + if not config[key]: # empty schema + continue + + if key not in modules and not hasattr(config[key], '__dict__'): + styled[key] = config[key] + continue + elif key in modules: + module = modules[key] + else: + type_name = type(config[key]).__name__ + if type_name in modules: + module = modules[type_name].copy() + module.update({ + k: v + for k, v in config[key].__dict__.items() + if k in module.schema + }) + key += " ({})".format(type_name) + default = module.find_default_keys() + missing = module.find_missing_keys() + mismatch = module.find_mismatch_keys() + extra = module.find_extra_keys() + dep_missing = [] + for dep in module.inject: + if isinstance(module[dep], str) and module[dep] != '': + if module[dep] not in modules: # not a valid module + dep_missing.append(dep) + else: + dep_mod = modules[module[dep]] + # empty dict but mandatory + if not dep_mod and dep_mod.mandatory(): + dep_missing.append(dep) + override = list( + set(module.keys()) - set(default) - set(extra) - set(dep_missing)) + replacement = {} + for name in set(override + default + extra + mismatch + missing): + new_name = name + if name in missing: + value = "" + else: + value = module[name] + + if name in extra: + value = dump_value(value) + " " + elif name in mismatch: + value = dump_value(value) + " " + elif name in dep_missing: + value = dump_value(value) + " " + elif name in override and value != '': + mark = green + new_name = mark + name + replacement[new_name] = value + styled[key] = replacement + buffer = yaml.dump(styled, default_flow_style=False, default_style='') + buffer = (re.sub(r"", r"[31m[0m", 
buffer)) + buffer = (re.sub(r"", r"[33m[0m", buffer)) + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", + r"[31m[0m", buffer)) + buffer = re.sub(r"___(\d+)___(.*?):", r"[\1m\2[0m:", buffer) + print(buffer) diff --git a/ppdet/utils/colormap.py b/ppdet/utils/colormap.py new file mode 100644 index 0000000..566185e --- /dev/null +++ b/ppdet/utils/colormap.py @@ -0,0 +1,56 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + + +def colormap(rgb=False): + """ + Get colormap + """ + color_list = np.array([ + 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, + 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, + 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, + 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, + 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, + 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, + 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, + 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, + 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, + 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, + 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, + 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, + 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, + 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, + 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 + ]).astype(np.float32) + color_list = color_list.reshape((-1, 3)) * 255 + if not rgb: + color_list = color_list[:, ::-1] + return color_list diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py new file mode 100644 index 0000000..99635c7 --- /dev/null +++ b/ppdet/utils/download.py @@ -0,0 +1,461 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import yaml +import shutil +import requests +import tqdm +import hashlib +import base64 +import binascii +import tarfile +import zipfile + +from .voc_utils import create_list +from ppdet.core.workspace import BASE_KEY + +from .logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'get_weights_path', 'get_dataset_path', 'get_config_path', + 'download_dataset', 'create_voc_list' +] + +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") +DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") +CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") + +# dict of {dataset_name: (download_info, sub_dirs)} +# download info: [(url, md5sum)] +DATASETS = { + 'coco': ([ + ( + 'http://images.cocodataset.org/zips/train2017.zip', + 'cced6f7f71b7629ddf16f17bbcfab6b2', ), + ( + 'http://images.cocodataset.org/zips/val2017.zip', + '442b8da7639aecaf257c1dceb8ba8c80', ), + ( + 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', + 'f4bbac642086de4f52a3fdda2de5fa2c', ), + ], ["annotations", "train2017", "val2017"]), + 'voc': ([ + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', + '6cd6e144f989b92b3379bac3b3de84fd', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', + 'c52e279531787c972589f7e41ab4ae64', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', + 'b6e924de25625d8de591ea690078ad9f', ), + ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), + 'wider_face': ([ + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', + '3fedf70df600953d25982bcd13d91ba2', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', + 'dfa7d7e790efa35df3788964cf0bbaea', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', + 'a4a898d6193db4b9ef3260a68bad0dc7', ), + ], ["WIDER_train", "WIDER_val", "wider_face_split"]), + 'fruit': ([( + 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', + 'baa8806617a54ccf3685fa7153388ae6', ), ], + ['Annotations', 'JPEGImages']), + 'roadsign_voc': ([( + 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', + '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), + 'roadsign_coco': ([( + 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', + '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), + 'objects365': (), +} + +DOWNLOAD_RETRY_LIMIT = 3 + +PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' + + +def parse_url(url): + url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) + return url + + +def get_weights_path(url): + """Get weights path from WEIGHTS_HOME, if not exists, + download it from url. + """ + url = parse_url(url) + path, _ = get_path(url, WEIGHTS_HOME) + return path + + +def get_config_path(url): + """Get weights path from CONFIGS_HOME, if not exists, + download it from url. 
+ """ + url = parse_url(url) + path, _ = get_path(url, CONFIGS_HOME) + _download_config(path, url, CONFIGS_HOME) + + return path + + +def _download_config(cfg_path, cfg_url, cur_dir): + with open(cfg_path) as f: + cfg = yaml.load(f, Loader=yaml.Loader) + + # download dependence base ymls + if BASE_KEY in cfg: + base_ymls = list(cfg[BASE_KEY]) + for base_yml in base_ymls: + if base_yml.startswith("~"): + base_yml = os.path.expanduser(base_yml) + relpath = osp.relpath(base_yml, cfg_path) + if not base_yml.startswith('/'): + relpath = base_yml + base_yml = os.path.join(os.path.dirname(cfg_path), base_yml) + + if osp.isfile(base_yml): + logger.debug("Found _BASE_ config: {}".format(base_yml)) + continue + + # download to CONFIGS_HOME firstly + base_yml_url = osp.join(osp.split(cfg_url)[0], relpath) + path, _ = get_path(base_yml_url, CONFIGS_HOME) + + # move from CONFIGS_HOME to dst_path to restore config directory structure + dst_path = osp.join(cur_dir, relpath) + dst_dir = osp.split(dst_path)[0] + if not osp.isdir(dst_dir): + os.makedirs(dst_dir) + shutil.move(path, dst_path) + + # perfrom download base yml recursively + _download_config(dst_path, base_yml_url, osp.split(dst_path)[0]) + + +def get_dataset_path(path, annotation, image_dir): + """ + If path exists, return path. + Otherwise, get dataset path from DATASET_HOME, if not exists, + download it. + """ + if _dataset_exists(path, annotation, image_dir): + return path + + logger.info("Dataset {} is not valid for reason above, try searching {} or " + "downloading dataset...".format( + osp.realpath(path), DATASET_HOME)) + + data_name = os.path.split(path.strip().lower())[-1] + for name, dataset in DATASETS.items(): + if data_name == name: + logger.debug("Parse dataset_dir {} as dataset " + "{}".format(path, name)) + if name == 'objects365': + raise NotImplementedError( + "Dataset {} is not valid for download automatically. " + "Please apply and download the dataset from " + "https://www.objects365.org/download.html".format(name)) + data_dir = osp.join(DATASET_HOME, name) + # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 + if name in ['voc', 'fruit', 'roadsign_voc']: + exists = True + for sub_dir in dataset[1]: + check_dir = osp.join(data_dir, sub_dir) + if osp.exists(check_dir): + logger.info("Found {}".format(check_dir)) + else: + exists = False + if exists: + return data_dir + + # voc exist is checked above, voc is not exist here + check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' + for url, md5sum in dataset[0]: + get_path(url, data_dir, md5sum, check_exist) + + # voc should create list after download + if name == 'voc': + create_voc_list(data_dir) + return data_dir + + # not match any dataset in DATASETS + raise ValueError( + "Dataset {} is not valid and cannot parse dataset type " + "'{}' for automaticly downloading, which only supports " + "'voc' , 'coco', 'wider_face', 'fruit' and 'roadsign_voc' currently". + format(path, osp.split(path)[-1])) + + +def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): + logger.debug("Create voc file list...") + devkit_dir = osp.join(data_dir, devkit_subdir) + years = ['2007', '2012'] + + # NOTE: since using auto download VOC + # dataset, VOC default label list should be used, + # do not generate label_list.txt here. 
For default + # label, see ../data/source/voc.py + create_list(devkit_dir, years, data_dir) + logger.debug("Create voc file list finished") + + +def map_path(url, root_dir): + # parse path after download to decompress under root_dir + fname = osp.split(url)[-1] + zip_formats = ['.zip', '.tar', '.gz'] + fpath = fname + for zip_format in zip_formats: + fpath = fpath.replace(zip_format, '') + return osp.join(root_dir, fpath) + + +def get_path(url, root_dir, md5sum=None, check_exist=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + """ + # parse path after download to decompress under root_dir + fullpath = map_path(url, root_dir) + + # For same zip file, decompressed directory name different + # from zip file name, rename by following map + decompress_name_map = { + "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", + "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", + "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", + "annotations_trainval": "annotations" + } + for k, v in decompress_name_map.items(): + if fullpath.find(k) >= 0: + fullpath = osp.join(osp.split(fullpath)[0], v) + + if osp.exists(fullpath) and check_exist: + if not osp.isfile(fullpath) or \ + _check_exist_file_md5(fullpath, md5sum, url): + logger.debug("Found {}".format(fullpath)) + return fullpath, True + else: + os.remove(fullpath) + + fullname = _download(url, root_dir, md5sum) + + # new weights format which postfix is 'pdparams' not + # need to decompress + if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml']: + _decompress(fullname) + + return fullpath, False + + +def download_dataset(path, dataset=None): + if dataset not in DATASETS.keys(): + logger.error("Unknown dataset {}, it should be " + "{}".format(dataset, DATASETS.keys())) + return + dataset_info = DATASETS[dataset][0] + for info in dataset_info: + get_path(info[0], path, info[1], False) + logger.debug("Download dataset {} finished.".format(dataset)) + + +def _dataset_exists(path, annotation, image_dir): + """ + Check if user define dataset exists + """ + if not osp.exists(path): + logger.debug("Config dataset_dir {} is not exits, " + "dataset config is not valid".format(path)) + return False + + if annotation: + annotation_path = osp.join(path, annotation) + if not osp.isfile(annotation_path): + logger.debug("Config annotation {} is not a " + "file, dataset config is not " + "valid".format(annotation_path)) + return False + if image_dir: + image_path = osp.join(path, image_dir) + if not osp.isdir(image_path): + logger.warning("Config image_dir {} is not a " + "directory, dataset config is not " + "valid".format(image_path)) + return False + return True + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, + url)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + return fullname + + +def _check_exist_file_md5(filename, md5sum, url): + # if md5sum is None, and file to check is weights file, + # read md5um from url and check, else check md5sum directly + return _md5check_from_url(filename, url) if md5sum is None \ + and filename.endswith('pdparams') \ + else _md5check(filename, md5sum) + + +def _md5check_from_url(filename, url): + # For weights in bcebos URLs, MD5 value is contained + # in request header as 'content_md5' + req = requests.get(url, stream=True) + content_md5 = req.headers.get('content-md5') + req.close() + if not content_md5 or _md5check( + filename, + binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( + )): + return True + else: + return False + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.debug("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.warning("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+ fpath = osp.split(fname)[0] + fpath_tmp = osp.join(fpath, 'tmp') + if osp.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + for f in os.listdir(fpath_tmp): + src_dir = osp.join(fpath_tmp, f) + dst_dir = osp.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not osp.exists(dst): + shutil.move(src, dst) + elif osp.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = osp.join(src, fp) + dst_fp = osp.join(dst, fp) + if osp.isdir(src_fp): + if osp.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif osp.isfile(src_fp) and \ + not osp.isfile(dst_fp): + shutil.move(src_fp, dst_fp) diff --git a/ppdet/utils/logger.py b/ppdet/utils/logger.py new file mode 100644 index 0000000..99b82f9 --- /dev/null +++ b/ppdet/utils/logger.py @@ -0,0 +1,71 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import logging +import os +import sys + +import paddle.distributed as dist + +__all__ = ['setup_logger'] + +logger_initialized = [] + + +def setup_logger(name="ppdet", output=None): + """ + Initialize logger and set its verbosity level to INFO. + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. 
+ name (str): the root module name of this logger + + Returns: + logging.Logger: a logger + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + + logger.setLevel(logging.INFO) + logger.propagate = False + + formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + # stdout logging: master only + local_rank = dist.get_rank() + if local_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, "log.txt") + if local_rank > 0: + filename = filename + ".rank{}".format(local_rank) + os.makedirs(os.path.dirname(filename)) + fh = logging.FileHandler(filename, mode='a') + fh.setLevel(logging.DEBUG) + fh.setFormatter(logging.Formatter()) + logger.addHandler(fh) + logger_initialized.append(name) + return logger diff --git a/ppdet/utils/stats.py b/ppdet/utils/stats.py new file mode 100644 index 0000000..6e4d284 --- /dev/null +++ b/ppdet/utils/stats.py @@ -0,0 +1,95 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np +import datetime + +__all__ = ['SmoothedValue', 'TrainingStats'] + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({avg:.4f})" + self.deque = collections.deque(maxlen=window_size) + self.fmt = fmt + self.total = 0. 
+ self.count = 0 + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + @property + def median(self): + return np.median(self.deque) + + @property + def avg(self): + return np.mean(self.deque) + + @property + def max(self): + return np.max(self.deque) + + @property + def value(self): + return self.deque[-1] + + @property + def global_avg(self): + return self.total / self.count + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, max=self.max, value=self.value) + + +class TrainingStats(object): + def __init__(self, window_size, delimiter=' '): + self.meters = None + self.window_size = window_size + self.delimiter = delimiter + + def update(self, stats): + if self.meters is None: + self.meters = { + k: SmoothedValue(self.window_size) + for k in stats.keys() + } + for k, v in self.meters.items(): + v.update(stats[k].numpy()) + + def get(self, extras=None): + stats = collections.OrderedDict() + if extras: + for k, v in extras.items(): + stats[k] = v + for k, v in self.meters.items(): + stats[k] = format(v.median, '.6f') + + return stats + + def log(self, extras=None): + d = self.get(extras) + strs = [] + for k, v in d.items(): + strs.append("{}: {}".format(k, str(v))) + return self.delimiter.join(strs) diff --git a/ppdet/utils/visualizer.py b/ppdet/utils/visualizer.py new file mode 100644 index 0000000..ecf9595 --- /dev/null +++ b/ppdet/utils/visualizer.py @@ -0,0 +1,202 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
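Before the visualizer below, a quick illustration of how the SmoothedValue / TrainingStats utilities from ppdet/utils/stats.py above fit together. This is only a sketch: the loss keys and values are invented here, whereas in practice they would come from the model's per-step output dict as paddle tensors.

```
# Minimal sketch; the loss names/values below are illustrative only.
import paddle
from ppdet.utils.stats import TrainingStats

stats = TrainingStats(window_size=20)
for step in range(100):
    # stand-ins for the per-step losses a model would return (paddle tensors)
    outputs = {
        'loss': paddle.to_tensor(1.0 / (step + 1)),
        'loss_bbox': paddle.to_tensor(0.5 / (step + 1)),
    }
    stats.update(outputs)
    if step % 20 == 0:
        # log() joins the extras with the windowed median of each meter
        print(stats.log({'epoch': 0, 'step': step}))
```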
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +from PIL import Image, ImageDraw +import cv2 +from .colormap import colormap +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['visualize_results'] + + +def visualize_results(image, + bbox_res, + mask_res, + segm_res, + im_id, + catid2name, + threshold=0.5): + """ + Visualize bbox and mask results + """ + if bbox_res is not None: + image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) + if mask_res is not None: + image = draw_mask(image, im_id, mask_res, threshold) + if segm_res is not None: + image = draw_segm(image, im_id, catid2name, segm_res, threshold) + return image + + +def draw_mask(image, im_id, segms, threshold, alpha=0.7): + """ + Draw mask on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = colormap(rgb=True) + img_array = np.array(image).astype('float32') + for dt in np.array(segms): + if im_id != dt['image_id']: + continue + segm, score = dt['segmentation'], dt['score'] + if score < threshold: + continue + import pycocotools.mask as mask_util + mask = mask_util.decode(segm) * 255 + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + img_array[idx[0], idx[1], :] *= 1.0 - alpha + img_array[idx[0], idx[1], :] += alpha * color_mask + return Image.fromarray(img_array.astype('uint8')) + + +def draw_bbox(image, im_id, catid2name, bboxes, threshold): + """ + Draw bbox on image + """ + draw = ImageDraw.Draw(image) + + catid2color = {} + color_list = colormap(rgb=True)[:40] + for dt in np.array(bboxes): + if im_id != dt['image_id']: + continue + catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] + if score < threshold: + continue + + if catid not in catid2color: + idx = np.random.randint(len(color_list)) + catid2color[catid] = color_list[idx] + color = tuple(catid2color[catid]) + + # draw bbox + if len(bbox) == 4: + # draw bbox + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill=color) + elif len(bbox) == 8: + x1, y1, x2, y2, x3, y3, x4, y4 = bbox + draw.line( + [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], + width=2, + fill=color) + xmin = min(x1, x2, x3, x4) + ymin = min(y1, y2, y3, y4) + else: + logger.error('the shape of bbox must be [M, 4] or [M, 8]!') + + # draw label + text = "{} {:.2f}".format(catid2name[catid], score) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + return image + + +def save_result(save_path, bbox_res, catid2name, threshold): + """ + save result as txt + """ + with open(save_path, 'w') as f: + for dt in bbox_res: + catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] + if score < threshold: + continue + # each bbox result as a line + # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4 + # for bbox: classname score x1 y1 w h + bbox_pred = '{} {} '.format(catid2name[catid], score) + ' '.join( + [str(e) for e in bbox]) + f.write(bbox_pred + '\n') + + +def draw_segm(image, + im_id, + catid2name, + segms, + threshold, + alpha=0.7, + draw_box=True): + """ + Draw segmentation on image + """ + mask_color_id = 0 + 
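# blend each instance color toward white by w_ratio before alpha-compositing the mask onto the image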
w_ratio = .4 + color_list = colormap(rgb=True) + img_array = np.array(image).astype('float32') + for dt in np.array(segms): + if im_id != dt['image_id']: + continue + segm, score, catid = dt['segmentation'], dt['score'], dt['category_id'] + if score < threshold: + continue + import pycocotools.mask as mask_util + mask = mask_util.decode(segm) * 255 + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + img_array[idx[0], idx[1], :] *= 1.0 - alpha + img_array[idx[0], idx[1], :] += alpha * color_mask + + if not draw_box: + center_y, center_x = ndimage.measurements.center_of_mass(mask) + label_text = "{}".format(catid2name[catid]) + vis_pos = (max(int(center_x) - 10, 0), int(center_y)) + cv2.putText(img_array, label_text, vis_pos, + cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255)) + else: + mask = mask_util.decode(segm) * 255 + sum_x = np.sum(mask, axis=0) + x = np.where(sum_x > 0.5)[0] + sum_y = np.sum(mask, axis=1) + y = np.where(sum_y > 0.5)[0] + x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] + cv2.rectangle(img_array, (x0, y0), (x1, y1), + tuple(color_mask.astype('int32').tolist()), 1) + bbox_text = '%s %.2f' % (catid2name[catid], score) + t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] + cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0], + y0 - t_size[1] - 3), + tuple(color_mask.astype('int32').tolist()), -1) + cv2.putText( + img_array, + bbox_text, (x0, y0 - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.3, (0, 0, 0), + 1, + lineType=cv2.LINE_AA) + + return Image.fromarray(img_array.astype('uint8')) diff --git a/ppdet/utils/voc_utils.py b/ppdet/utils/voc_utils.py new file mode 100644 index 0000000..1a4f326 --- /dev/null +++ b/ppdet/utils/voc_utils.py @@ -0,0 +1,87 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import re +import random +import shutil + +__all__ = ['create_list'] + + +def create_list(devkit_dir, years, output_dir): + """ + create following list: + 1. trainval.txt + 2. 
test.txt + """ + trainval_list = [] + test_list = [] + for year in years: + trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) + trainval_list.extend(trainval) + test_list.extend(test) + + random.shuffle(trainval_list) + with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: + for item in trainval_list: + ftrainval.write(item[0] + ' ' + item[1] + '\n') + + with open(osp.join(output_dir, 'test.txt'), 'w') as fval: + ct = 0 + for item in test_list: + ct += 1 + fval.write(item[0] + ' ' + item[1] + '\n') + + +def _get_voc_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) + + +def _walk_voc_dir(devkit_dir, year, output_dir): + filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') + img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') + trainval_list = [] + test_list = [] + added = set() + + for _, _, files in os.walk(filelist_dir): + for fname in files: + img_ann_list = [] + if re.match(r'[a-z]+_trainval\.txt', fname): + img_ann_list = trainval_list + elif re.match(r'[a-z]+_test\.txt', fname): + img_ann_list = test_list + else: + continue + fpath = osp.join(filelist_dir, fname) + for line in open(fpath): + name_prefix = line.strip().split()[0] + if name_prefix in added: + continue + added.add(name_prefix) + ann_path = osp.join( + osp.relpath(annotation_dir, output_dir), + name_prefix + '.xml') + img_path = osp.join( + osp.relpath(img_dir, output_dir), name_prefix + '.jpg') + img_ann_list.append((img_path, ann_path)) + + return trainval_list, test_list diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8ce34b5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +tqdm +typeguard ; python_version >= '3.4' +visualdl>=2.1.0 +opencv-python +PyYAML +shapely +scipy +terminaltables +pycocotools +setuptools>=42.0.0 diff --git a/tools/anchor_cluster.py b/tools/anchor_cluster.py new file mode 100644 index 0000000..0b339bb --- /dev/null +++ b/tools/anchor_cluster.py @@ -0,0 +1,363 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
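Before the anchor clustering tool below, a note on the list files that create_list above produces: every line in trainval.txt / test.txt pairs an image path with its annotation path, both relative to the dataset root. A small sketch of calling it directly (the dataset location and sample id are only examples):

```
# Sketch only: dataset_dir and the sample id are examples, not requirements.
import os.path as osp
from ppdet.utils.voc_utils import create_list

dataset_dir = osp.expanduser('~/.cache/paddle/dataset/voc')
# scans the VOCdevkit/VOC2007 and VOCdevkit/VOC2012 ImageSets/Main lists and
# writes trainval.txt / test.txt into dataset_dir
create_list(osp.join(dataset_dir, 'VOCdevkit'), ['2007', '2012'], dataset_dir)

# each resulting line looks like (image path, then annotation path):
# VOCdevkit/VOC2007/JPEGImages/000005.jpg VOCdevkit/VOC2007/Annotations/000005.xml
```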
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.anchor_cluster') + +from scipy.cluster.vq import kmeans +import random +import numpy as np +from tqdm import tqdm + +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.core.workspace import load_config, merge_config, create + + +class BaseAnchorCluster(object): + def __init__(self, n, cache_path, cache, verbose=True): + """ + Base Anchor Cluster + + Args: + n (int): number of clusters + cache_path (str): cache directory path + cache (bool): whether using cache + verbose (bool): whether print results + """ + super(BaseAnchorCluster, self).__init__() + self.n = n + self.cache_path = cache_path + self.cache = cache + self.verbose = verbose + + def print_result(self, centers): + raise NotImplementedError('%s.print_result is not available' % + self.__class__.__name__) + + def get_whs(self): + whs_cache_path = os.path.join(self.cache_path, 'whs.npy') + shapes_cache_path = os.path.join(self.cache_path, 'shapes.npy') + if self.cache and os.path.exists(whs_cache_path) and os.path.exists( + shapes_cache_path): + self.whs = np.load(whs_cache_path) + self.shapes = np.load(shapes_cache_path) + return self.whs, self.shapes + whs = np.zeros((0, 2)) + shapes = np.zeros((0, 2)) + self.dataset.parse_dataset() + roidbs = self.dataset.roidbs + for rec in tqdm(roidbs): + h, w = rec['h'], rec['w'] + bbox = rec['gt_bbox'] + wh = bbox[:, 2:4] - bbox[:, 0:2] + 1 + wh = wh / np.array([[w, h]]) + shape = np.ones_like(wh) * np.array([[w, h]]) + whs = np.vstack((whs, wh)) + shapes = np.vstack((shapes, shape)) + + if self.cache: + os.makedirs(self.cache_path, exist_ok=True) + np.save(whs_cache_path, whs) + np.save(shapes_cache_path, shapes) + + self.whs = whs + self.shapes = shapes + return self.whs, self.shapes + + def calc_anchors(self): + raise NotImplementedError('%s.calc_anchors is not available' % + self.__class__.__name__) + + def __call__(self): + self.get_whs() + centers = self.calc_anchors() + if self.verbose: + self.print_result(centers) + return centers + + +class YOLOv2AnchorCluster(BaseAnchorCluster): + def __init__(self, + n, + dataset, + size, + cache_path, + cache, + iters=1000, + verbose=True): + super(YOLOv2AnchorCluster, self).__init__( + n, cache_path, cache, verbose=verbose) + """ + YOLOv2 Anchor Cluster + + Reference: + https://github.com/AlexeyAB/darknet/blob/master/scripts/gen_anchors.py + + Args: + n (int): number of clusters + dataset (DataSet): DataSet instance, VOC or COCO + size (list): [w, h] + cache_path (str): cache directory path + cache (bool): whether using cache + iters (int): kmeans algorithm iters + verbose (bool): whether print results + """ + self.dataset = dataset + self.size = size + self.iters = iters + + def print_result(self, centers): + logger.info('%d anchor cluster result: [w, h]' % self.n) + for w, h in centers: + logger.info('[%d, %d]' % (round(w), round(h))) + + def metric(self, whs, centers): + wh1 = whs[:, None] + wh2 = centers[None] + inter = np.minimum(wh1, wh2).prod(2) + return inter / (wh1.prod(2) + wh2.prod(2) - inter) + + def kmeans_expectation(self, whs, centers, assignments): + dist = 
self.metric(whs, centers) + new_assignments = dist.argmax(1) + converged = (new_assignments == assignments).all() + return converged, new_assignments + + def kmeans_maximizations(self, whs, centers, assignments): + new_centers = np.zeros_like(centers) + for i in range(centers.shape[0]): + mask = (assignments == i) + if mask.sum(): + new_centers[i, :] = whs[mask].mean(0) + return new_centers + + def calc_anchors(self): + self.whs = self.whs * np.array([self.size]) + # random select k centers + whs, n, iters = self.whs, self.n, self.iters + logger.info('Running kmeans for %d anchors on %d points...' % + (n, len(whs))) + idx = np.random.choice(whs.shape[0], size=n, replace=False) + centers = whs[idx] + assignments = np.zeros(whs.shape[0:1]) * -1 + # kmeans + if n == 1: + return self.kmeans_maximizations(whs, centers, assignments) + + pbar = tqdm(range(iters), desc='Cluster anchors with k-means algorithm') + for _ in pbar: + # E step + converged, assignments = self.kmeans_expectation(whs, centers, + assignments) + if converged: + logger.info('kmeans algorithm has converged') + break + # M step + centers = self.kmeans_maximizations(whs, centers, assignments) + ious = self.metric(whs, centers) + pbar.desc = 'avg_iou: %.4f' % (ious.max(1).mean()) + + centers = sorted(centers, key=lambda x: x[0] * x[1]) + return centers + + +class YOLOv5AnchorCluster(BaseAnchorCluster): + def __init__(self, + n, + dataset, + size, + cache_path, + cache, + iters=300, + gen_iters=1000, + thresh=0.25, + verbose=True): + super(YOLOv5AnchorCluster, self).__init__( + n, cache_path, cache, verbose=verbose) + """ + YOLOv5 Anchor Cluster + + Reference: + https://github.com/ultralytics/yolov5/blob/master/utils/general.py + + Args: + n (int): number of clusters + dataset (DataSet): DataSet instance, VOC or COCO + size (list): [w, h] + cache_path (str): cache directory path + cache (bool): whether using cache + iters (int): iters of kmeans algorithm + gen_iters (int): iters of genetic algorithm + threshold (float): anchor scale threshold + verbose (bool): whether print results + """ + self.dataset = dataset + self.size = size + self.iters = iters + self.gen_iters = gen_iters + self.thresh = thresh + + def print_result(self, centers): + whs = self.whs + centers = centers[np.argsort(centers.prod(1))] + x, best = self.metric(whs, centers) + bpr, aat = ( + best > self.thresh).mean(), (x > self.thresh).mean() * self.n + logger.info( + 'thresh=%.2f: %.4f best possible recall, %.2f anchors past thr' % + (self.thresh, bpr, aat)) + logger.info( + 'n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thresh=%.3f-mean: ' + % (self.n, self.size, x.mean(), best.mean(), + x[x > self.thresh].mean())) + logger.info('%d anchor cluster result: [w, h]' % self.n) + for w, h in centers: + logger.info('[%d, %d]' % (round(w), round(h))) + + def metric(self, whs, centers): + r = whs[:, None] / centers[None] + x = np.minimum(r, 1. / r).min(2) + return x, x.max(1) + + def fitness(self, whs, centers): + _, best = self.metric(whs, centers) + return (best * (best > self.thresh)).mean() + + def calc_anchors(self): + self.whs = self.whs * self.shapes / self.shapes.max( + 1, keepdims=True) * np.array([self.size]) + wh0 = self.whs + i = (wh0 < 3.0).any(1).sum() + if i: + logger.warn('Extremely small objects found. %d of %d' + 'labels are < 3 pixels in width or height' % + (i, len(wh0))) + + wh = wh0[(wh0 >= 2.0).any(1)] + logger.info('Running kmeans for %g anchors on %g points...' 
% + (self.n, len(wh))) + s = wh.std(0) + centers, dist = kmeans(wh / s, self.n, iter=self.iters) + centers *= s + + f, sh, mp, s = self.fitness(wh, centers), centers.shape, 0.9, 0.1 + pbar = tqdm( + range(self.gen_iters), + desc='Evolving anchors with Genetic Algorithm') + for _ in pbar: + v = np.ones(sh) + while (v == 1).all(): + v = ((np.random.random(sh) < mp) * np.random.random() * + np.random.randn(*sh) * s + 1).clip(0.3, 3.0) + new_centers = (centers.copy() * v).clip(min=2.0) + new_f = self.fitness(wh, new_centers) + if new_f > f: + f, centers = new_f, new_centers.copy() + pbar.desc = 'Evolving anchors with Genetic Algorithm: fitness = %.4f' % f + + return centers + + +def main(): + parser = ArgsParser() + parser.add_argument( + '--n', '-n', default=9, type=int, help='num of clusters') + parser.add_argument( + '--iters', + '-i', + default=1000, + type=int, + help='num of iterations for kmeans') + parser.add_argument( + '--gen_iters', + '-gi', + default=1000, + type=int, + help='num of iterations for genetic algorithm') + parser.add_argument( + '--thresh', + '-t', + default=0.25, + type=float, + help='anchor scale threshold') + parser.add_argument( + '--verbose', '-v', default=True, type=bool, help='whether print result') + parser.add_argument( + '--size', + '-s', + default=None, + type=str, + help='image size: w,h, using comma as delimiter') + parser.add_argument( + '--method', + '-m', + default='v2', + type=str, + help='cluster method, [v2, v5] are supported now') + parser.add_argument( + '--cache_path', default='cache', type=str, help='cache path') + parser.add_argument( + '--cache', action='store_true', help='whether use cache') + FLAGS = parser.parse_args() + + cfg = load_config(FLAGS.config) + merge_config(FLAGS.opt) + check_config(cfg) + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + # check if paddlepaddle version is satisfied + check_version() + + # get dataset + dataset = cfg['TrainDataset'] + if FLAGS.size: + if ',' in FLAGS.size: + size = list(map(int, FLAGS.size.split(','))) + assert len(size) == 2, "the format of size is incorrect" + else: + size = int(FLAGS.size) + size = [size, size] + elif 'inputs_def' in cfg['TrainReader'] and 'image_shape' in cfg[ + 'TrainReader']['inputs_def']: + size = cfg['TrainReader']['inputs_def']['image_shape'][1:] + else: + raise ValueError('size is not specified') + + if FLAGS.method == 'v2': + cluster = YOLOv2AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path, + FLAGS.cache, FLAGS.iters, FLAGS.verbose) + elif FLAGS.method == 'v5': + cluster = YOLOv5AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path, + FLAGS.cache, FLAGS.iters, FLAGS.gen_iters, + FLAGS.thresh, FLAGS.verbose) + else: + raise ValueError('cluster method: %s is not supported' % FLAGS.method) + + anchors = cluster() + + +if __name__ == "__main__": + main() diff --git a/tools/eval.py b/tools/eval.py new file mode 100644 index 0000000..5df7a07 --- /dev/null +++ b/tools/eval.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer, init_parallel_env +from ppdet.metrics.coco_utils import json_eval_results +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('eval') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + + parser.add_argument( + '--json_eval', + action='store_true', + default=False, + help='Whether to re eval with already exists bbox.json or mask.json') + + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + + # TODO: bias should be unified + parser.add_argument( + "--bias", + action="store_true", + help="whether add bias or not while getting w and h") + + parser.add_argument( + "--classwise", + action="store_true", + help="whether per-category AP and draw P-R Curve or not.") + + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + if FLAGS.json_eval: + logger.info( + "In json_eval mode, PaddleDetection will evaluate json files in " + "output_eval directly. And proposal.json, bbox.json and mask.json " + "will be detected by default.") + json_eval_results( + cfg.metric, + json_directory=FLAGS.output_eval, + dataset=cfg['EvalDataset']) + return + + # init parallel environment if nranks > 1 + init_parallel_env() + + # build trainer + trainer = Trainer(cfg, mode='eval') + + # load weights + trainer.load_weights(cfg.weights) + + # training + trainer.evaluate() + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + # TODO: bias should be unified + cfg['bias'] = 1 if FLAGS.bias else 0 + cfg['classwise'] = True if FLAGS.classwise else False + cfg['output_eval'] = FLAGS.output_eval + cfg['save_prediction_only'] = FLAGS.save_prediction_only + merge_config(FLAGS.opt) + + place = paddle.set_device('gpu' if cfg.use_gpu else 'cpu') + + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn' and not cfg.use_gpu: + cfg['norm_type'] = 'bn' + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='eval') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/export_model.py b/tools/export_model.py new file mode 100644 index 0000000..8cf3885 --- /dev/null +++ b/tools/export_model.py @@ -0,0 +1,105 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('export_model') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_dir", + type=str, + default="output_inference", + help="Directory for storing the output model files.") + parser.add_argument( + "--export_serving_model", + type=bool, + default=False, + help="Whether to export serving model or not.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # build detector + trainer = Trainer(cfg, mode='test') + + # load weights + trainer.load_weights(cfg.weights) + + # export model + trainer.export(FLAGS.output_dir) + + if FLAGS.export_serving_model: + from paddle_serving_client.io import inference_model_to_serving + model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0] + + inference_model_to_serving( + dirname="{}/{}".format(FLAGS.output_dir, model_name), + serving_server="{}/{}/serving_server".format(FLAGS.output_dir, + model_name), + serving_client="{}/{}/serving_client".format(FLAGS.output_dir, + model_name), + model_filename="model.pdmodel", + params_filename="model.pdiparams") + + +def main(): + paddle.set_device("cpu") + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + # TODO: to be refined in the future + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn': + FLAGS.opt['norm_type'] = 'bn' + merge_config(FLAGS.opt) + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/infer.py b/tools/infer.py new file mode 100644 index 0000000..7ea0d23 --- /dev/null +++ b/tools/infer.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
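Before tools/infer.py below, note that the export path in tools/export_model.py reduces to a short sequence of Trainer calls; a condensed sketch of that same flow (the config path and output directory are examples):

```
# Condensed sketch of the export flow; config path and output dir are examples.
import paddle
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

paddle.set_device('cpu')   # export_model.py runs export on CPU
cfg = load_config('configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml')
trainer = Trainer(cfg, mode='test')
trainer.load_weights(cfg.weights)   # weights may be a local path or a ppdet:// URL
trainer.export('output_inference')  # writes the inference model under output_inference/<config name>/
```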
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') +import glob + +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.engine import Trainer +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.slim import build_slim_model + +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--infer_dir", + type=str, + default=None, + help="Directory for images to perform inference on.") + parser.add_argument( + "--infer_img", + type=str, + default=None, + help="Image path, has higher priority over --infer_dir") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory for storing the output visualization files.") + parser.add_argument( + "--draw_threshold", + type=float, + default=0.5, + help="Threshold to reserve the result for visualization.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="Whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/image", + help='VisualDL logging directory for image.') + parser.add_argument( + "--save_txt", + type=bool, + default=False, + help="Whether to save inference result in txt.") + args = parser.parse_args() + return args + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--infer_img or --infer_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + logger.info("Found {} inference images in total.".format(len(images))) + + return images + + +def run(FLAGS, cfg): + # build trainer + trainer = Trainer(cfg, mode='test') + + # load weights + trainer.load_weights(cfg.weights) + + # get inference images + images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) + + # inference + trainer.predict( + images, + draw_threshold=FLAGS.draw_threshold, + output_dir=FLAGS.output_dir, + save_txt=FLAGS.save_txt) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + cfg['use_vdl'] = FLAGS.use_vdl + cfg['vdl_log_dir'] = FLAGS.vdl_log_dir + merge_config(FLAGS.opt) + + place = paddle.set_device('gpu' if cfg.use_gpu else 'cpu') + + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn' and not cfg.use_gpu: + cfg['norm_type'] = 'bn' + + 
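# optionally rebuild the model from a slim config (e.g. pruning or quantization) before running prediction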
if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000..d9ef6d6 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os, sys +# add python path of PadleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +import random +import numpy as np +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.utils.checkpoint import load_weight +from ppdet.engine import Trainer, init_parallel_env, set_random_seed, init_fleet_env +from ppdet.slim import build_slim_model + +import ppdet.utils.cli as cli +import ppdet.utils.check as check +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = cli.ArgsParser() + parser.add_argument( + "--eval", + action='store_true', + default=False, + help="Whether to perform evaluation in train") + parser.add_argument( + "-r", "--resume", default=None, help="weights path for resume") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--enable_ce", + type=bool, + default=False, + help="If set True, enable continuous evaluation job." 
+ "This flag is only used for internal test.") + parser.add_argument( + "--fp16", + action='store_true', + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--fleet", action='store_true', default=False, help="Use fleet or not") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/scalar", + help='VisualDL logging directory for scalar.') + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # init fleet environment + if cfg.fleet: + init_fleet_env() + else: + # init parallel environment if nranks > 1 + init_parallel_env() + + if FLAGS.enable_ce: + set_random_seed(0) + + # build trainer + trainer = Trainer(cfg, mode='train') + + # load weights + if FLAGS.resume is not None: + trainer.resume_weights(FLAGS.resume) + elif 'pretrain_weights' in cfg and cfg.pretrain_weights: + trainer.load_weights(cfg.pretrain_weights) + + # training + trainer.train(FLAGS.eval) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + cfg['fp16'] = FLAGS.fp16 + cfg['fleet'] = FLAGS.fleet + cfg['use_vdl'] = FLAGS.use_vdl + cfg['vdl_log_dir'] = FLAGS.vdl_log_dir + cfg['save_prediction_only'] = FLAGS.save_prediction_only + merge_config(FLAGS.opt) + + place = paddle.set_device('gpu' if cfg.use_gpu else 'cpu') + + if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn' and not cfg.use_gpu: + cfg['norm_type'] = 'bn' + + if FLAGS.slim_config: + cfg = build_slim_model(cfg, FLAGS.slim_config) + + check.check_config(cfg) + check.check_gpu(cfg.use_gpu) + check.check_version() + + run(FLAGS, cfg) + + +if __name__ == "__main__": + main() diff --git a/tools/x2coco.py b/tools/x2coco.py new file mode 100644 index 0000000..ef2f0d7 --- /dev/null +++ b/tools/x2coco.py @@ -0,0 +1,449 @@ +#!/usr/bin/env python +# coding: utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import glob +import json +import os +import os.path as osp +import sys +import shutil +import xml.etree.ElementTree as ET +from tqdm import tqdm +import re + +import numpy as np +import PIL.ImageDraw + +label_to_num = {} +categories_list = [] +labels_list = [] + + +class MyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(MyEncoder, self).default(obj) + + +def images_labelme(data, num): + image = {} + image['height'] = data['imageHeight'] + image['width'] = data['imageWidth'] + image['id'] = num + 1 + if '\\' in data['imagePath']: + image['file_name'] = data['imagePath'].split('\\')[-1] + else: + image['file_name'] = data['imagePath'].split('/')[-1] + return image + + +def images_cityscape(data, num, img_file): + image = {} + image['height'] = data['imgHeight'] + image['width'] = data['imgWidth'] + image['id'] = num + 1 + image['file_name'] = img_file + return image + + +def categories(label, labels_list): + category = {} + category['supercategory'] = 'component' + category['id'] = len(labels_list) + 1 + category['name'] = label + return category + + +def annotations_rectangle(points, label, image_num, object_num, label_to_num): + annotation = {} + seg_points = np.asarray(points).copy() + seg_points[1, :] = np.asarray(points)[2, :] + seg_points[2, :] = np.asarray(points)[1, :] + annotation['segmentation'] = [list(seg_points.flatten())] + annotation['iscrowd'] = 0 + annotation['image_id'] = image_num + 1 + annotation['bbox'] = list( + map(float, [ + points[0][0], points[0][1], points[1][0] - points[0][0], points[1][ + 1] - points[0][1] + ])) + annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3] + annotation['category_id'] = label_to_num[label] + annotation['id'] = object_num + 1 + return annotation + + +def annotations_polygon(height, width, points, label, image_num, object_num, + label_to_num): + annotation = {} + annotation['segmentation'] = [list(np.asarray(points).flatten())] + annotation['iscrowd'] = 0 + annotation['image_id'] = image_num + 1 + annotation['bbox'] = list(map(float, get_bbox(height, width, points))) + annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3] + annotation['category_id'] = label_to_num[label] + annotation['id'] = object_num + 1 + return annotation + + +def get_bbox(height, width, points): + polygons = points + mask = np.zeros([height, width], dtype=np.uint8) + mask = PIL.Image.fromarray(mask) + xy = list(map(tuple, polygons)) + PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1) + mask = np.array(mask, dtype=bool) + index = np.argwhere(mask == 1) + rows = index[:, 0] + clos = index[:, 1] + left_top_r = np.min(rows) + left_top_c = np.min(clos) + right_bottom_r = np.max(rows) + right_bottom_c = np.max(clos) + return [ + left_top_c, left_top_r, right_bottom_c - left_top_c, + right_bottom_r - left_top_r + ] + + +def deal_json(ds_type, img_path, json_path): + data_coco = {} + images_list = [] + annotations_list = [] + image_num = -1 + object_num = -1 + for img_file in os.listdir(img_path): + img_label = os.path.splitext(img_file)[0] + if img_file.split('.')[ + -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']: + continue + label_file = osp.join(json_path, img_label + '.json') + print('Generating dataset from:', label_file) + image_num = image_num + 1 + with open(label_file) as f: + data = json.load(f) + if 
ds_type == 'labelme': + images_list.append(images_labelme(data, image_num)) + elif ds_type == 'cityscape': + images_list.append(images_cityscape(data, image_num, img_file)) + if ds_type == 'labelme': + for shapes in data['shapes']: + object_num = object_num + 1 + label = shapes['label'] + if label not in labels_list: + categories_list.append(categories(label, labels_list)) + labels_list.append(label) + label_to_num[label] = len(labels_list) + p_type = shapes['shape_type'] + if p_type == 'polygon': + points = shapes['points'] + annotations_list.append( + annotations_polygon(data['imageHeight'], data[ + 'imageWidth'], points, label, image_num, + object_num, label_to_num)) + + if p_type == 'rectangle': + (x1, y1), (x2, y2) = shapes['points'] + x1, x2 = sorted([x1, x2]) + y1, y2 = sorted([y1, y2]) + points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]] + annotations_list.append( + annotations_rectangle(points, label, image_num, + object_num, label_to_num)) + elif ds_type == 'cityscape': + for shapes in data['objects']: + object_num = object_num + 1 + label = shapes['label'] + if label not in labels_list: + categories_list.append(categories(label, labels_list)) + labels_list.append(label) + label_to_num[label] = len(labels_list) + points = shapes['polygon'] + annotations_list.append( + annotations_polygon(data['imgHeight'], data[ + 'imgWidth'], points, label, image_num, object_num, + label_to_num)) + data_coco['images'] = images_list + data_coco['categories'] = categories_list + data_coco['annotations'] = annotations_list + return data_coco + + +def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path): + with open(labels_path, 'r') as f: + labels_str = f.read().split() + labels_ids = list(range(1, len(labels_str) + 1)) + + with open(ann_ids_path, 'r') as f: + ann_ids = f.read().split() + ann_paths = [] + for aid in ann_ids: + if aid.endswith('xml'): + ann_path = os.path.join(ann_dir_path, aid) + else: + ann_path = os.path.join(ann_dir_path, aid + '.xml') + ann_paths.append(ann_path) + + return dict(zip(labels_str, labels_ids)), ann_paths + + +def voc_get_image_info(annotation_root, im_id): + filename = annotation_root.findtext('filename') + assert filename is not None + img_name = os.path.basename(filename) + + size = annotation_root.find('size') + width = float(size.findtext('width')) + height = float(size.findtext('height')) + + image_info = { + 'file_name': filename, + 'height': height, + 'width': width, + 'id': im_id + } + return image_info + + +def voc_get_coco_annotation(obj, label2id): + label = obj.findtext('name') + assert label in label2id, "label is not in label2id." + category_id = label2id[label] + bndbox = obj.find('bndbox') + xmin = float(bndbox.findtext('xmin')) + ymin = float(bndbox.findtext('ymin')) + xmax = float(bndbox.findtext('xmax')) + ymax = float(bndbox.findtext('ymax')) + assert xmax > xmin and ymax > ymin, "Box size error." 
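+    # COCO stores boxes as [x, y, width, height], so the VOC-style
+    # [xmin, ymin, xmax, ymax] corners are converted to that layout below.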
+ o_width = xmax - xmin + o_height = ymax - ymin + anno = { + 'area': o_width * o_height, + 'iscrowd': 0, + 'bbox': [xmin, ymin, o_width, o_height], + 'category_id': category_id, + 'ignore': 0, + } + return anno + + +def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file): + output_json_dict = { + "images": [], + "type": "instances", + "annotations": [], + "categories": [] + } + bnd_id = 1 # bounding box start id + im_id = 0 + print('Start converting !') + for a_path in tqdm(annotation_paths): + # Read annotation xml + ann_tree = ET.parse(a_path) + ann_root = ann_tree.getroot() + + img_info = voc_get_image_info(ann_root, im_id) + output_json_dict['images'].append(img_info) + + for obj in ann_root.findall('object'): + ann = voc_get_coco_annotation(obj=obj, label2id=label2id) + ann.update({'image_id': im_id, 'id': bnd_id}) + output_json_dict['annotations'].append(ann) + bnd_id = bnd_id + 1 + im_id += 1 + + for label, label_id in label2id.items(): + category_info = {'supercategory': 'none', 'id': label_id, 'name': label} + output_json_dict['categories'].append(category_info) + output_file = os.path.join(output_dir, output_file) + with open(output_file, 'w') as f: + output_json = json.dumps(output_json_dict) + f.write(output_json) + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--dataset_type', + help='the type of dataset, can be `voc`, `labelme` or `cityscape`') + parser.add_argument('--json_input_dir', help='input annotated directory') + parser.add_argument('--image_input_dir', help='image directory') + parser.add_argument( + '--output_dir', help='output dataset directory', default='./') + parser.add_argument( + '--train_proportion', + help='the proportion of train dataset', + type=float, + default=1.0) + parser.add_argument( + '--val_proportion', + help='the proportion of validation dataset', + type=float, + default=0.0) + parser.add_argument( + '--test_proportion', + help='the proportion of test dataset', + type=float, + default=0.0) + parser.add_argument( + '--voc_anno_dir', + help='In Voc format dataset, path to annotation files directory.', + type=str, + default=None) + parser.add_argument( + '--voc_anno_list', + help='In Voc format dataset, path to annotation files ids list.', + type=str, + default=None) + parser.add_argument( + '--voc_label_list', + help='In Voc format dataset, path to label list. 
The content of each line is a category.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_out_name',
+        type=str,
+        default='voc.json',
+        help='In Voc format dataset, path to output json file')
+    args = parser.parse_args()
+    try:
+        assert args.dataset_type in ['voc', 'labelme', 'cityscape']
+    except AssertionError as e:
+        print(
+            'Only the voc, labelme and cityscape dataset types are supported!')
+        os._exit(0)
+
+    if args.dataset_type == 'voc':
+        assert args.voc_anno_dir and args.voc_anno_list and args.voc_label_list
+        label2id, ann_paths = voc_get_label_anno(
+            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
+        voc_xmls_to_cocojson(
+            annotation_paths=ann_paths,
+            label2id=label2id,
+            output_dir=args.output_dir,
+            output_file=args.voc_out_name)
+    else:
+        try:
+            assert os.path.exists(args.json_input_dir)
+        except AssertionError as e:
+            print('The json folder does not exist!')
+            os._exit(0)
+        try:
+            assert os.path.exists(args.image_input_dir)
+        except AssertionError as e:
+            print('The image folder does not exist!')
+            os._exit(0)
+        try:
+            assert abs(args.train_proportion + args.val_proportion \
+                    + args.test_proportion - 1.0) < 1e-5
+        except AssertionError as e:
+            print(
+                'The sum of the training, validation and test dataset proportions must be 1!'
+            )
+            os._exit(0)
+
+        # Allocate the dataset.
+        total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
+        if args.train_proportion != 0:
+            train_num = int(total_num * args.train_proportion)
+            out_dir = args.output_dir + '/train'
+            if not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+        else:
+            train_num = 0
+        if args.val_proportion == 0.0:
+            val_num = 0
+            test_num = total_num - train_num
+            out_dir = args.output_dir + '/test'
+            if args.test_proportion != 0.0 and not os.path.exists(out_dir):
+                os.makedirs(out_dir)
+        else:
+            val_num = int(total_num * args.val_proportion)
+            test_num = total_num - train_num - val_num
+            val_out_dir = args.output_dir + '/val'
+            if not os.path.exists(val_out_dir):
+                os.makedirs(val_out_dir)
+            test_out_dir = args.output_dir + '/test'
+            if args.test_proportion != 0.0 and not os.path.exists(test_out_dir):
+                os.makedirs(test_out_dir)
+        count = 1
+        for img_name in os.listdir(args.image_input_dir):
+            if count <= train_num:
+                if osp.exists(args.output_dir + '/train/'):
+                    shutil.copyfile(
+                        osp.join(args.image_input_dir, img_name),
+                        osp.join(args.output_dir + '/train/', img_name))
+            else:
+                if count <= train_num + val_num:
+                    if osp.exists(args.output_dir + '/val/'):
+                        shutil.copyfile(
+                            osp.join(args.image_input_dir, img_name),
+                            osp.join(args.output_dir + '/val/', img_name))
+                else:
+                    if osp.exists(args.output_dir + '/test/'):
+                        shutil.copyfile(
+                            osp.join(args.image_input_dir, img_name),
+                            osp.join(args.output_dir + '/test/', img_name))
+            count = count + 1
+
+        # Deal with the json files.
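+        # The image files have been copied into <output_dir>/train, /val and
+        # /test above; the matching COCO annotation files are written below to
+        # <output_dir>/annotations/instance_{train,val,test}.json.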
+ if not os.path.exists(args.output_dir + '/annotations'): + os.makedirs(args.output_dir + '/annotations') + if args.train_proportion != 0: + train_data_coco = deal_json(args.dataset_type, + args.output_dir + '/train', + args.json_input_dir) + train_json_path = osp.join(args.output_dir + '/annotations', + 'instance_train.json') + json.dump( + train_data_coco, + open(train_json_path, 'w'), + indent=4, + cls=MyEncoder) + if args.val_proportion != 0: + val_data_coco = deal_json(args.dataset_type, + args.output_dir + '/val', + args.json_input_dir) + val_json_path = osp.join(args.output_dir + '/annotations', + 'instance_val.json') + json.dump( + val_data_coco, + open(val_json_path, 'w'), + indent=4, + cls=MyEncoder) + if args.test_proportion != 0: + test_data_coco = deal_json(args.dataset_type, + args.output_dir + '/test', + args.json_input_dir) + test_json_path = osp.join(args.output_dir + '/annotations', + 'instance_test.json') + json.dump( + test_data_coco, + open(test_json_path, 'w'), + indent=4, + cls=MyEncoder) + + +if __name__ == '__main__': + main()
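
For reference, below is a hedged sketch of invoking tools/x2coco.py on a VOC-style dataset. Only the command-line flags are taken from the argument parser above; the annotation directory, id-list and label-list paths are placeholders for an actual dataset layout.

```bash
# Sketch only: replace the paths with a real VOC-style dataset layout.
python tools/x2coco.py \
    --dataset_type voc \
    --voc_anno_dir dataset/voc/Annotations \
    --voc_anno_list dataset/voc/trainval.txt \
    --voc_label_list dataset/voc/label_list.txt \
    --voc_out_name voc_train.json \
    --output_dir ./dataset/voc
```

The resulting COCO-format file is written to `<output_dir>/<voc_out_name>`, with one category entry per line of the label list.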