diff --git a/mmdetection3d/projects/BEVFusion/__init__.py b/mmdetection3d/projects/BEVFusion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mmdetection3d/projects/CMT/nuscenes_corrupt.py b/mmdetection3d/projects/CMT/nuscenes_corrupt.py index 80c2faf..5cc92c3 100644 --- a/mmdetection3d/projects/CMT/nuscenes_corrupt.py +++ b/mmdetection3d/projects/CMT/nuscenes_corrupt.py @@ -9,7 +9,9 @@ from mmdet3d.evaluation.metrics.nuscenes_metric import NuScenesMetric from scene_split import train_day, train_night, train_rain, train_dry, val_day, val_night, val_rain, val_dry, train, val - +from mmengine import Config, load +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger from nuscenes.nuscenes import NuScenes @@ -742,6 +744,61 @@ def __init__(self, ann_file=ann_file, metric=metric, **kwargs) + + def compute_metrics(self, results: List[dict]) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List[dict]): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + print('computing metrics for split:', self.split) + logger: MMLogger = MMLogger.get_current_instance() + + classes = self.dataset_meta['classes'] + self.version = self.dataset_meta['version'] + # load annotations + self.data_infos = load( + self.ann_file, backend_args=self.backend_args)['data_list'] + # filter out data info not in the split if split is specified + if self.split is not None: + filtered_data_infos = [] + scene_list = self.SPLIT_SCENES[self.split] + for info in self.data_infos: + # get scene token from sample token + sample_token = info['token'] + sample = self.nusc.get('sample', sample_token) + scene_token = sample['scene_token'] + scene_name = self.nusc.get('scene', scene_token)['name'] + if scene_name in scene_list: + filtered_data_infos.append(info) + self.data_infos = filtered_data_infos + + + result_dict, tmp_dir = self.format_results(results, classes, + self.jsonfile_prefix) + + metric_dict = {} + + if self.format_only: + logger.info( + f'results are saved in {osp.basename(self.jsonfile_prefix)}') + return metric_dict + + for metric in self.metrics: + ap_dict = self.nus_evaluate( + result_dict, classes=classes, metric=metric, logger=logger) + for result in ap_dict: + metric_dict[result] = ap_dict[result] + + if tmp_dir is not None: + tmp_dir.cleanup() + return metric_dict + + def _evaluate_single(self, result_path: str, diff --git a/mmdetection3d/projects/TransFusion/README.md b/mmdetection3d/projects/TransFusion/README.md new file mode 100644 index 0000000..1b29653 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/README.md @@ -0,0 +1,3 @@ +# TransFusion +* modernized TransfusionConfig +* check example_scripts for training code. 
\ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/__init__.py b/mmdetection3d/projects/TransFusion/__init__.py new file mode 100644 index 0000000..3fee9f0 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/__init__.py @@ -0,0 +1,12 @@ +from .transfusion import TransFusionDetector +from .transfusion_head import TransFusion_TransFusionHead +from .transform3d import MyImgToList, MyNormalize, MyPad, MyResize + +__all__ = [ + 'TransFusionDetector', + 'TransFusion_TransFusionHead', + 'MyImgToList', + 'MyNormalize', + 'MyResize', + 'MyPad', +] \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_L.py b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_L.py new file mode 100644 index 0000000..f86a5e6 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_L.py @@ -0,0 +1,337 @@ +# Phase one trains LIDAR + + +_base_ = [ + '../../../configs/_base_/default_runtime.py', +] +custom_imports = dict( + imports=['projects.TransFusion', 'projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +voxel_size = [0.2, 0.2, 8] # original configs +grid_size = [512, 512, 1] +out_size_factor = 4 +dataset_type = 'NuScenesDataset' +data_root = '/data/nuscenes/' + +lr = 0.0001 + +backend_args = None +metainfo = dict(classes=class_names) +data_prefix = dict(pts='samples/LIDAR_TOP', sweeps='sweeps/LIDAR_TOP') + +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 
2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='ObjectSample', + db_sampler=dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ))), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.9, 1.1], + translation_std=[0.5, 0.5, 0.5]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict( + type='Pack3DDetInputs', + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=[ + 'box_type_3d', 'sample_idx', 'lidar_path', + 'num_pts_feats', + ]), +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + use_valid_flag=True, + box_type_3d='LiDAR'))) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') +model = dict( + type='TransFusionDetector', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxel=True, + voxel_type='hard', + voxel_layer=dict( + max_num_points=20, + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(30000, 60000), + )), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), + point_cloud_range=point_cloud_range, + ), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512) + ), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', 
bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='TransFusion_TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=128 * 3, + hidden_channel=128, + num_classes=len(class_names), + num_decoder_layers=1, + num_heads=8, + ffn_channel=256, + dropout=0.1, + activation='relu', + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + type='TransFusionBBoxCoder', + pc_range=point_cloud_range[:2], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + #loss_iou=dict( this is not actually used + # type='mmdet.VarifocalLoss', use_sigmoid=True, iou_weighted=True, reduction='mean'), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), + train_cfg=dict( + pts=dict( + dataset='nuScenes', + point_cloud_range=point_cloud_range, + grid_size=grid_size, + voxel_size=voxel_size, + out_size_factor=out_size_factor, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25)))), + test_cfg=dict( + pts=dict( + dataset='nuScenes', + grid_size=grid_size, + 
out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + pc_range=point_cloud_range[:2], + nms_type=None)), +) + +auto_scale_lr = dict(enable=False, base_batch_size=8) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + # learning rate scheduler + # During the first 8 epochs, learning rate increases from lr to lr * 10 + # during the next 12 epochs, learning rate decreases from lr * 10 to + # lr * 1e-4 + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 10, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=12, + eta_min=lr * 1e-4, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + dict( + type='CosineAnnealingMomentum', + T_max=8, + eta_min=0.85 / 0.95, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=12, + eta_min=1, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), +] + +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5) +val_cfg = dict() +test_cfg = dict() +log_processor = dict(window_size=50) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50), + checkpoint=dict(type='CheckpointHook', interval=1)) diff --git a/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_LC.py b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_LC.py new file mode 100644 index 0000000..c4864e6 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_LC.py @@ -0,0 +1,360 @@ +# Phase 2 Fusion +_base_ = [ + '../../../configs/_base_/default_runtime.py', +] +custom_imports = dict( + imports=['projects.TransFusion', 'projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +class_names = [ + 'car', 'truck', 
'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +voxel_size = [0.2, 0.2, 8] +grid_size = [512, 512, 1] +out_size_factor = 4 +dataset_type = 'NuScenesDataset' +data_root = '/data/nuscenes/' + + +lr = 0.0001 + +backend_args = None +metainfo = dict(classes=class_names) +data_prefix = dict( + pts='samples/LIDAR_TOP', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT', + sweeps='sweeps/LIDAR_TOP') + +input_modality = dict( + use_lidar=True, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) + +# ================= Image Configs ================= +num_views = 6 +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +img_scale = (800, 448) + +# ================= Pipelines ================= +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles',num_views=num_views, backend_args=backend_args), + dict(type='MyComputeMultiViewCalib'), + dict(type='MyNormalize', **img_norm_cfg), + dict(type='MyResize', scale=img_scale, keep_ratio=True), + dict(type='MyPad', size_divisor=32), + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.9, 1.1], + translation_std=[0.5, 0.5, 0.5]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', 
point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='MyImgToList'), + dict( + type='Pack3DDetInputs', + keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'], + meta_keys=[ + 'box_type_3d', 'sample_idx', 'lidar_path', + 'num_pts_feats', 'lidar2img', 'lidar2cam', 'cam2img', 'cam2lidar', 'ori_lidar2img', 'ori_cam2img', + 'img_shape', 'input_shape', 'scale_factor', 'flip', 'img_crop_offset', + ], + ), +] + +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadMultiViewImageFromFiles', num_views=num_views, backend_args=backend_args), # Added + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='MyComputeMultiViewCalib'), + dict(type='MyNormalize', **img_norm_cfg), + dict(type='MyResize', scale=img_scale, keep_ratio=True), + dict(type='MyPad', size_divisor=32), + dict(type='MyImgToList'), + dict( + type='Pack3DDetInputs', + keys=['points', 'img'], # Added 'img' + meta_keys=[ + 'box_type_3d', 'sample_idx', 'lidar_path', + 'num_pts_feats', 'lidar2img', 'img_shape', 'input_shape', + 'scale_factor', 'flip', 'img_crop_offset', + ]), +] + +# ================= Data Loaders ================= +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + use_valid_flag=True, + box_type_3d='LiDAR'))) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader +model = dict( + type='TransFusionDetector', + freeze_img=True, # Should we train a bit? + img_backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + img_neck=dict( + type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxel=True, + voxel_type='hard', + mean=img_norm_cfg['mean'], + std=img_norm_cfg['std'], + voxel_layer=dict( + max_num_points=20, + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(30000, 60000), + )), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), + point_cloud_range=point_cloud_range, + ), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512) + ), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + 
pts_bbox_head=dict( + type='TransFusion_TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=128 * 3, + hidden_channel=128, + num_classes=len(class_names), + num_decoder_layers=1, + fuse_img=True, + num_views=num_views, + in_channels_img=256, # Matches FPN out_channels + out_size_factor_img=4, # Default for ResNet+FPN + initialize_by_heatmap=True, # Standard for TransFusion + nms_kernel_size=3, + num_heads=8, + ffn_channel=256, + dropout=0.1, + activation='relu', + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + type='TransFusionBBoxCoder', + pc_range=point_cloud_range[:2], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), + train_cfg=dict( + pts=dict( + dataset='nuScenes', + point_cloud_range=point_cloud_range, + grid_size=grid_size, + voxel_size=voxel_size, + out_size_factor=out_size_factor, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25)))), + test_cfg=dict( + pts=dict( + dataset='nuScenes', + grid_size=grid_size, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + pc_range=point_cloud_range[:2], + nms_type=None)), +) + +#load_from = 'work_dirs/transfusion_lidar/epoch_20.pth' # change this to 
phase 1 checkpoint + +auto_scale_lr = dict(enable=False, base_batch_size=8) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.01), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=2, + eta_min=lr * 10, + begin=0, + end=2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min=lr * 1e-4, + begin=2, + end=6, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=2, + eta_min=0.85 / 0.95, + begin=0, + end=2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=4, + eta_min=1, + begin=2, + end=6, + by_epoch=True, + convert_to_iter_based=True), +] + +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) +val_cfg = dict() +test_cfg = dict() +log_processor = dict(window_size=50) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50), + checkpoint=dict(type='CheckpointHook', interval=1)) + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') + + +#freeze_lidar_components = True no one really consumes this +find_unused_parameters = True diff --git a/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_L.py b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_L.py new file mode 100644 index 0000000..78ddcb1 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_L.py @@ -0,0 +1,340 @@ +# Phase one trains LIDAR + +_base_ = [ + '../../../configs/_base_/default_runtime.py', +] +custom_imports = dict( + imports=['projects.TransFusion', 
'projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +voxel_size = [0.075, 0.075, 0.2] +grid_size = [1440, 1440, 40] # (54+54)/0.075=1440, (5+3)/0.2=40 +out_size_factor = 8 +dataset_type = 'NuScenesDataset' +data_root = '/data/nuscenes/' + +lr = 0.0001 + +backend_args = None +metainfo = dict(classes=class_names) +data_prefix = dict(pts='samples/LIDAR_TOP', sweeps='sweeps/LIDAR_TOP') + +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='ObjectSample', + db_sampler=dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ))), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925 * 2, 0.3925 * 2], # kept original voxel rotation range + scale_ratio_range=[0.9, 1.1], + translation_std=[0.5, 0.5, 0.5]), + dict( + type='RandomFlip3D', + 
sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + # Pack3DDetInputs replaces DefaultFormatBundle3D + Collect3D + dict( + type='Pack3DDetInputs', + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Pack3DDetInputs', + keys=['points'], + meta_keys=[ + 'box_type_3d', 'sample_idx', 'lidar_path', + 'num_pts_feats', + ]), +] +# Replaced data dict with separate dataloader dicts +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + use_valid_flag=True, + box_type_3d='LiDAR'))) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 
'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') +model = dict( + type='TransFusionDetector', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxel=True, + voxel_type='hard', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(120000, 160000), + )), + pts_voxel_encoder=dict( + type='HardSimpleVFE', + num_features=5, + ), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[41, 1440, 1440], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='TransFusion_TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=256 * 2, + hidden_channel=128, + num_classes=len(class_names), + num_decoder_layers=1, + num_heads=8, + ffn_channel=256, + dropout=0.1, + activation='relu', + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + type='TransFusionBBoxCoder', + pc_range=point_cloud_range[:2], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=out_size_factor, 
+ voxel_size=voxel_size[:2], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), + train_cfg=dict( + pts=dict( + dataset='nuScenes', + point_cloud_range=point_cloud_range, + grid_size=grid_size, + voxel_size=voxel_size, + out_size_factor=out_size_factor, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25)))), + test_cfg=dict( + pts=dict( + dataset='nuScenes', + grid_size=grid_size, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + pc_range=point_cloud_range[:2], + nms_type=None)), +) + +auto_scale_lr = dict(enable=False, base_batch_size=8) + +# Replaced optimizer/optimizer_config with optim_wrapper +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +param_scheduler = [ + # learning rate scheduler + # During the first 8 epochs, learning rate increases from lr to lr * 10 + # during the next 12 epochs, learning rate decreases from lr * 10 to + # lr * 1e-4 + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 10, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=12, + eta_min=lr * 1e-4, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + dict( + type='CosineAnnealingMomentum', + T_max=8, + eta_min=0.85 / 0.95, + 
begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=12, + eta_min=1, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), +] + +# Replaced total_epochs with train_cfg +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5) +val_cfg = dict() +test_cfg = dict() +log_processor = dict(window_size=50) + +# Replaced checkpoint_config/log_config with default_hooks +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50), + checkpoint=dict(type='CheckpointHook', interval=1)) \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_LC.py b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_LC.py new file mode 100644 index 0000000..7f48dd6 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_LC.py @@ -0,0 +1,356 @@ +# Phase 2 Fusion: finetune with camera on top of voxel_L checkpoint + +_base_ = [ + '../../../configs/_base_/default_runtime.py', +] +custom_imports = dict( + imports=['projects.TransFusion', 'projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +voxel_size = [0.075, 0.075, 0.2] +grid_size = [1440, 1440, 40] # (54+54)/0.075=1440, (5+3)/0.2=40 +out_size_factor = 8 +dataset_type = 'NuScenesDataset' +data_root = '/data/nuscenes/' + +lr = 0.0001 + +backend_args = None +metainfo = dict(classes=class_names) +# Added camera paths for fusion +data_prefix = dict( + pts='samples/LIDAR_TOP', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT', + sweeps='sweeps/LIDAR_TOP') + +# 
use_camera=True for fusion +input_modality = dict( + use_lidar=True, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) + +# ================= Image Configs ================= +img_scale = (800, 448) +num_views = 6 +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# ================= Pipelines ================= +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + # Added for camera fusion + dict(type='LoadMultiViewImageFromFiles', backend_args=backend_args), + + + # dict( + # type='GlobalRotScaleTrans', + # rot_range=[-0.3925 * 2, 0.3925 * 2], + # scale_ratio_range=[0.9, 1.1], + # translation_std=[0.5, 0.5, 0.5]), + # dict( + # type='RandomFlip3D', + # sync_2d=True, + # flip_ratio_bev_horizontal=0.5, + # flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + # Image preprocessing + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + # Added 'img' key for camera fusion + dict( + type='Pack3DDetInputs', + keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']), +] + +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args, + ), + dict(type='LoadMultiViewImageFromFiles', backend_args=backend_args), + 
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + # Image preprocessing + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='Pack3DDetInputs', + keys=['points', 'img'], + meta_keys=[ + 'box_type_3d', 'sample_idx', 'lidar_path', + 'num_pts_feats', 'lidar2img', # lidar2img needed for fusion + ]), +] + +# ================= Data Loaders ================= +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + use_valid_flag=True, + box_type_3d='LiDAR'))) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +model = dict( + type='TransFusionDetector', + freeze_img=True, # freeze image backbone during fusion finetuning + img_backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', 
requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + img_neck=dict( + type='mmdet.FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxel=True, + voxel_type='hard', + mean=img_norm_cfg['mean'], + std=img_norm_cfg['std'], + voxel_layer=dict( + max_num_points=10, + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(120000, 160000), + )), + pts_voxel_encoder=dict( + type='HardSimpleVFE', + num_features=5, + ), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[41, 1440, 1440], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='TransFusion_TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=256 * 2, + hidden_channel=128, + num_classes=len(class_names), + num_decoder_layers=1, + fuse_img=True, + num_views=num_views, + in_channels_img=256, # matches FPN out_channels + out_size_factor_img=4, # default for ResNet+FPN + initialize_by_heatmap=True, + num_heads=8, + ffn_channel=256, + dropout=0.1, + activation='relu', + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + 
type='TransFusionBBoxCoder', + pc_range=point_cloud_range[:2], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + # loss_iou=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), + train_cfg=dict( + pts=dict( + dataset='nuScenes', + point_cloud_range=point_cloud_range, + grid_size=grid_size, + voxel_size=voxel_size, + out_size_factor=out_size_factor, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25)))), + test_cfg=dict( + pts=dict( + dataset='nuScenes', + grid_size=grid_size, + out_size_factor=out_size_factor, + voxel_size=voxel_size[:2], + pc_range=point_cloud_range[:2], + nms_type=None)), +) + +# Load phase 1 voxel_L checkpoint +load_from = 'work_dirs/transfusion_lidar/epoch_20.pth' + +auto_scale_lr = dict(enable=False, base_batch_size=8) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +# 6 epochs for fusion finetuning (shorter than phase 1) +param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=2, + eta_min=lr * 10, + begin=0, + end=2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + 
eta_min=lr * 1e-4, + begin=2, + end=6, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=2, + eta_min=0.85 / 0.95, + begin=0, + end=2, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=4, + eta_min=1, + begin=2, + end=6, + by_epoch=True, + convert_to_iter_based=True), +] + +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) +val_cfg = dict() +test_cfg = dict() +log_processor = dict(window_size=50) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50), + checkpoint=dict(type='CheckpointHook', interval=1)) diff --git a/mmdetection3d/projects/TransFusion/example_scripts/train_pillar.sh b/mmdetection3d/projects/TransFusion/example_scripts/train_pillar.sh new file mode 100644 index 0000000..4352652 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/example_scripts/train_pillar.sh @@ -0,0 +1,53 @@ +#!/bin/bash -l +#SBATCH --nodes=1 # Allocate *at least* 1 node to this job. 
+#SBATCH --ntasks=1 # Allocate *at most* 1 task for job steps in the job +#SBATCH --cpus-per-task=16 # Each task needs only one CPU +#SBATCH --mem=192gb # This particular job won't need much memory +#SBATCH --time=4-00:01:00 # 4 days and 1 minute +#SBATCH --job-name="trans_PL" +#SBATCH -p batch # You could pick other partitions for other jobs +#SBATCH --wait-all-nodes=1 # Run once all resources are available +#SBATCH --output=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/logs/transfusion_PL.txt + +date +nvidia-smi + + +## Navigate to mmdetection3d root +cd /data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d +pwd +## Set path +export PYTHONPATH=$(pwd):$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/BEVFusion:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion:$PYTHONPATH +# GPU Settings +export CUDA_VISIBLE_DEVICES=0,1 +NUM_GPUS=2 + +WORKDIR=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/work_dir/transfusion_lidar +export PORT=$((29500 + RANDOM % 1000)) + +CFG=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_L.py + +bash tools/dist_train.sh \ + $CFG \ + $NUM_GPUS \ + --work-dir $WORKDIR \ + --cfg-options \ + "train_dataloader.batch_size=16" \ + "train_dataloader.num_workers=32" \ + "val_dataloader.batch_size=4" \ + "val_dataloader.num_workers=8" \ + "default_hooks.checkpoint.out_dir=${WORKDIR}" \ + "data_root=/data/HangQiu/data/nuscenes/" \ + "train_dataloader.dataset.dataset.pipeline.3.db_sampler.data_root=/data/HangQiu/data/nuscenes/" \ + "train_dataloader.dataset.dataset.pipeline.3.db_sampler.info_path=/data/HangQiu/data/nuscenes/nuscenes_dbinfos_train.pkl" \ + "train_dataloader.dataset.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + 
"val_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "test_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ + "test_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "test_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/example_scripts/train_pillar2.sh b/mmdetection3d/projects/TransFusion/example_scripts/train_pillar2.sh new file mode 100644 index 0000000..b74b685 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/example_scripts/train_pillar2.sh @@ -0,0 +1,51 @@ +#!/bin/bash -l +#SBATCH --nodes=1 # Allocate *at least* 1 node to this job. +#SBATCH --ntasks=1 # Allocate *at most* 1 task for job steps in the job +#SBATCH --cpus-per-task=16 # Each task needs only one CPU +#SBATCH --mem=192gb # This particular job won't need much memory +#SBATCH --time=4-00:01:00 # 4 days and 1 minute +#SBATCH --job-name="trans_PLC" +#SBATCH -p batch # You could pick other partitions for other jobs +#SBATCH --wait-all-nodes=1 # Run once all resources are available +#SBATCH --output=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/logs/transfusion_PLC.txt + +date +nvidia-smi + + +## Navigate to mmdetection3d root +cd /data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d +pwd +## Set path +export PYTHONPATH=$(pwd):$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/BEVFusion:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion:$PYTHONPATH +# GPU Settings +export CUDA_VISIBLE_DEVICES=0,1 +NUM_GPUS=2 + 
+WORKDIR=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/work_dir/transfusion_pillar_LC +export PORT=$((29500 + RANDOM % 1000)) + +CFG=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_pillar_LC.py + +bash tools/dist_train.sh \ + $CFG \ + $NUM_GPUS \ + --work-dir $WORKDIR \ + --cfg-options \ + "train_dataloader.batch_size=10" \ + "train_dataloader.num_workers=20" \ + "val_dataloader.batch_size=4" \ + "val_dataloader.num_workers=8" \ + "default_hooks.checkpoint.out_dir=${WORKDIR}" \ + "data_root=/data/HangQiu/data/nuscenes/" \ + "train_dataloader.dataset.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "test_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ + "test_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "test_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/example_scripts/train_voxel.sh b/mmdetection3d/projects/TransFusion/example_scripts/train_voxel.sh new file mode 100644 index 0000000..c77fe9e --- /dev/null +++ b/mmdetection3d/projects/TransFusion/example_scripts/train_voxel.sh @@ -0,0 +1,55 @@ +#!/bin/bash -l +#SBATCH --nodes=1 # Allocate *at least* 1 node to this job. 
+#SBATCH --ntasks=1 # Allocate *at most* 1 task for job steps in the job +#SBATCH --cpus-per-task=16 # Each task needs only one CPU +#SBATCH --mem=192gb # This particular job won't need much memory +#SBATCH --time=4-00:01:00 # 4 days and 1 minute +#SBATCH --job-name="trans_VL" +#SBATCH -p batch # You could pick other partitions for other jobs +#SBATCH --wait-all-nodes=1 # Run once all resources are available +#SBATCH --output=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/logs/transfusion_VL.txt + +date +nvidia-smi + + +## Navigate to mmdetection3d root +cd /data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d +pwd +## Set path +export PYTHONPATH=$(pwd):$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/BEVFusion:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion:$PYTHONPATH +# GPU Settings +export CUDA_VISIBLE_DEVICES=0,1 +NUM_GPUS=2 + +WORKDIR=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/work_dir/transfusion_voxel +export PORT=$((29500 + RANDOM % 1000)) + +CFG=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_L.py + +bash tools/dist_train.sh \ + $CFG \ + $NUM_GPUS \ + --work-dir $WORKDIR \ + --cfg-options \ + "train_dataloader.batch_size=16" \ + "train_dataloader.num_workers=32" \ + "val_dataloader.batch_size=4" \ + "val_dataloader.num_workers=8" \ + "default_hooks.checkpoint.out_dir=${WORKDIR}" \ + "data_root=/data/HangQiu/data/nuscenes/" \ + "db_sampler.data_root=/data/HangQiu/data/nuscenes/" \ + "db_sampler.info_path=/data/HangQiu/data/nuscenes/nuscenes_dbinfos_train.pkl" \ + "train_dataloader.dataset.dataset.pipeline.3.db_sampler.data_root=/data/HangQiu/data/nuscenes/" \ + 
"train_dataloader.dataset.dataset.pipeline.3.db_sampler.info_path=/data/HangQiu/data/nuscenes/nuscenes_dbinfos_train.pkl" \ + "train_dataloader.dataset.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "test_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ + "test_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "test_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/example_scripts/train_voxel2.sh b/mmdetection3d/projects/TransFusion/example_scripts/train_voxel2.sh new file mode 100644 index 0000000..5ee7405 --- /dev/null +++ b/mmdetection3d/projects/TransFusion/example_scripts/train_voxel2.sh @@ -0,0 +1,51 @@ +#!/bin/bash -l +#SBATCH --nodes=1 # Allocate *at least* 1 node to this job. 
+#SBATCH --ntasks=1 # Allocate *at most* 1 task for job steps in the job +#SBATCH --cpus-per-task=16 # Request 16 CPU cores for the task +#SBATCH --mem=192gb # Request 192 GB of host memory +#SBATCH --time=4-00:01:00 # 4 days and 1 minute +#SBATCH --job-name="trans_VLC" +#SBATCH -p batch # You could pick other partitions for other jobs +#SBATCH --wait-all-nodes=1 # Run once all resources are available +#SBATCH --output=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/logs/transfusion_VLC.txt + +date +nvidia-smi + + +## Navigate to mmdetection3d root +cd /data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d +pwd +## Set path +export PYTHONPATH=$(pwd):$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/BEVFusion:$PYTHONPATH +export PYTHONPATH=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion:$PYTHONPATH +# GPU Settings +export CUDA_VISIBLE_DEVICES=0,1 +NUM_GPUS=2 + +WORKDIR=/data/HangQiu/proj/mlsys/DynMMOT_scripts/transfusion/work_dir/transfusion_voxel_LC +export PORT=$((29500 + RANDOM % 1000)) + +# Voxel LiDAR+camera run: use the voxel_LC config, matching the job name and WORKDIR above +CFG=/data/HangQiu/proj/mlsys/DynMMOT/mmdetection3d/projects/TransFusion/configs/transfusion_nusc_voxel_LC.py + +bash tools/dist_train.sh \ + $CFG \ + $NUM_GPUS \ + --work-dir $WORKDIR \ + --cfg-options \ + "train_dataloader.batch_size=8" \ + "train_dataloader.num_workers=16" \ + "val_dataloader.batch_size=4" \ + "val_dataloader.num_workers=8" \ + "default_hooks.checkpoint.out_dir=${WORKDIR}" \ + "data_root=/data/HangQiu/data/nuscenes/" \ + "train_dataloader.dataset.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "test_dataloader.dataset.data_root=/data/HangQiu/data/nuscenes/" \ + "val_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ +
"val_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ + "test_evaluator.data_root=/data/HangQiu/data/nuscenes/" \ + "test_evaluator.ann_file=/data/HangQiu/data/nuscenes/nuscenes_infos_val.pkl" \ No newline at end of file diff --git a/mmdetection3d/projects/TransFusion/transform3d.py b/mmdetection3d/projects/TransFusion/transform3d.py new file mode 100644 index 0000000..d6cc5fb --- /dev/null +++ b/mmdetection3d/projects/TransFusion/transform3d.py @@ -0,0 +1,150 @@ +import copy + +import numpy as np +import mmcv +from mmcv.transforms import BaseTransform +from mmdet3d.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class MyComputeMultiViewCalib(BaseTransform): + """Compute multi-view calibration matrices from per-camera info in results['images']. + Some how we don't have lidar2img so this is necessary + The standard LoadMultiViewImageFromFiles and det3d_dataset only set lidar2img for the + default_cam_key (single view). For LC fusion the head needs (num_views, 4, 4) projection + matrices. This transform replicates BEVFusion's BEVLoadMultiViewImageFromFiles calibration + logic (cam2img, lidar2cam, cam2lidar, lidar2img) without modifying BEVFusion code. + + Added keys: + lidar2img (num_views, 4, 4): cam2img @ lidar2cam per view. + cam2img (num_views, 4, 4): 4x4 intrinsic (3x3 padded to 4x4) per view. + lidar2cam (num_views, 4, 4): extrinsic per view. + cam2lidar (num_views, 4, 4): inverse extrinsic per view. + ori_cam2img, ori_lidar2img: deep copies before any augmentation. 
+ """ + + def transform(self, results: dict) -> dict: + cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [] + for _, cam_item in results['images'].items(): + l2c = np.array(cam_item['lidar2cam'], dtype=np.float32) + c2l = np.eye(4, dtype=np.float32) + c2l[:3, :3] = l2c[:3, :3].T + c2l[:3, 3:4] = -l2c[:3, :3].T @ l2c[:3, 3:4] + + c2i = np.eye(4, dtype=np.float32) + c2i[:3, :3] = np.array(cam_item['cam2img'], dtype=np.float32) + + lidar2cam.append(l2c) + cam2img.append(c2i) + cam2lidar.append(c2l) + lidar2img.append(c2i @ l2c) + + results['cam2img'] = np.stack(cam2img, axis=0) + results['lidar2cam'] = np.stack(lidar2cam, axis=0) + results['cam2lidar'] = np.stack(cam2lidar, axis=0) + results['lidar2img'] = np.stack(lidar2img, axis=0) + results['ori_cam2img'] = copy.deepcopy(results['cam2img']) + results['ori_lidar2img'] = copy.deepcopy(results['lidar2img']) + return results + + +@TRANSFORMS.register_module() +class MyImgToList(BaseTransform): + """Convert results['img'] from ndarray to a list of per-view arrays.""" + + def transform(self, results): + img = results['img'] + if not isinstance(img, list): + if img.ndim == 4: + results['img'] = [img[i] for i in range(img.shape[0])] + else: + results['img'] = [img] + return results + + +@TRANSFORMS.register_module() +class MyNormalize(BaseTransform): + """Normalize each image in a multi-view image list.""" + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def transform(self, results): + results['img'] = [ + mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) + for img in results['img'] + ] + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + +@TRANSFORMS.register_module() +class MyResize(BaseTransform): + """Resize each image in a multi-view image list.""" + + def __init__(self, scale, keep_ratio=False, interpolation='bilinear'): + if 
isinstance(scale, int): + self.scale = (scale, scale) + else: + self.scale = scale + self.keep_ratio = keep_ratio + self.interpolation = interpolation + + def transform(self, results): + resized = [] + scale_factor = None + for img in results['img']: + if self.keep_ratio: + img_out, scale_factor = mmcv.imrescale( + img, self.scale, + interpolation=self.interpolation, + return_scale=True) + else: + img_out, w_scale, h_scale = mmcv.imresize( + img, self.scale, + interpolation=self.interpolation, + return_scale=True) + scale_factor = (w_scale, h_scale) + resized.append(img_out) + results['img'] = resized + results['img_shape'] = resized[0].shape[:2] + if scale_factor is not None: + if isinstance(scale_factor, (int, float)): + # imrescale returns a single uniform scale + results['scale_factor'] = np.array([scale_factor, scale_factor], dtype=np.float32) + else: + results['scale_factor'] = np.array([scale_factor[0], scale_factor[1]], dtype=np.float32) + return results + + +@TRANSFORMS.register_module() +class MyPad(BaseTransform): + """Pad each image in a multi-view image list.""" + + def __init__(self, size=None, size_divisor=None, pad_val=0): + assert size is not None or size_divisor is not None + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + + def transform(self, results): + padded = [] + for img in results['img']: + if self.size_divisor is not None: + h, w = img.shape[:2] + pad_h = int(np.ceil(h / self.size_divisor)) * self.size_divisor + pad_w = int(np.ceil(w / self.size_divisor)) * self.size_divisor + size = (pad_h, pad_w) + else: + size = self.size[::-1] + pad_val = self.pad_val + if isinstance(pad_val, int) and img.ndim == 3: + pad_val = tuple(pad_val for _ in range(img.shape[2])) + padded.append(mmcv.impad(img, shape=size, pad_val=pad_val)) + results['img'] = padded + results['pad_shape'] = padded[0].shape + results['input_shape'] = padded[0].shape[:2] + return results diff --git 
a/mmdetection3d/projects/TransFusion/transfusion.py b/mmdetection3d/projects/TransFusion/transfusion.py new file mode 100644 index 0000000..9ad48eb --- /dev/null +++ b/mmdetection3d/projects/TransFusion/transfusion.py @@ -0,0 +1,90 @@ +from typing import Dict, List, Optional + +import torch +from torch import Tensor + +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmdet3d.registry import MODELS +from mmdet3d.structures import Det3DDataSample + + +@MODELS.register_module() +class TransFusionDetector(MVXTwoStageDetector): + """TransFusion detector for LiDAR-Camera 3D object detection. + + Inherits from MVXTwoStageDetector which provides: + - extract_feat (voxelize -> voxel_encoder -> middle_encoder -> backbone -> neck) + - loss (pts_bbox_head.loss) + + This subclass adds: + - Image freezing logic for the two-stage training scheme + - predict() override to pass batch_input_metas (list of dicts) instead + of batch_data_samples to TransFusionHead.predict(), which expects dicts + """ + + def __init__(self, freeze_img: bool = False, **kwargs): + super().__init__(**kwargs) + self.freeze_img = freeze_img + if self.freeze_img: + if self.with_img_backbone: + for param in self.img_backbone.parameters(): + param.requires_grad = False + if self.with_img_neck: + for param in self.img_neck.parameters(): + param.requires_grad = False + + def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], + batch_data_samples: List[Det3DDataSample], + **kwargs) -> List[Det3DDataSample]: + batch_input_metas = [item.metainfo for item in batch_data_samples] + img_feats, pts_feats = self.extract_feat(batch_inputs_dict, + batch_input_metas) + #for input_meta in batch_input_metas: + # print(f"input_meta keys: {input_meta.keys()}") + + if img_feats is None: + results_list_3d = self.pts_bbox_head.predict( + (pts_feats, None), batch_input_metas, **kwargs) + else: + results_list_3d = self.pts_bbox_head.predict( + (pts_feats, img_feats), batch_input_metas, 
**kwargs) + + detsamples = self.add_pred_to_datasample(batch_data_samples, + results_list_3d, + None) + return detsamples + + + def loss(self, batch_inputs_dict: Dict[List, torch.Tensor], + batch_data_samples: List[Det3DDataSample], + **kwargs) -> List[Det3DDataSample]: + """ + Args: + batch_inputs_dict (dict): The model input dict which include + 'points' and `imgs` keys. + + - points (list[torch.Tensor]): Point cloud of each sample. + - imgs (torch.Tensor): Tensor of batch images, has shape + (B, C, H ,W) + batch_data_samples (List[:obj:`Det3DDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance_3d`, . + + Returns: + dict[str, Tensor]: A dictionary of loss components. + + """ + + batch_input_metas = [item.metainfo for item in batch_data_samples] + #for input_meta in batch_input_metas: + # print(f"input_meta keys: {input_meta.keys()}") + img_feats, pts_feats = self.extract_feat(batch_inputs_dict, + batch_input_metas) + losses = dict() + + if img_feats is None: + loss = self.pts_bbox_head.loss((pts_feats, None), batch_data_samples, **kwargs) + else: + loss = self.pts_bbox_head.loss((pts_feats, img_feats), batch_data_samples, **kwargs) + losses.update(loss ) + return losses diff --git a/mmdetection3d/projects/TransFusion/transfusion_head.py b/mmdetection3d/projects/TransFusion/transfusion_head.py new file mode 100644 index 0000000..f5559cd --- /dev/null +++ b/mmdetection3d/projects/TransFusion/transfusion_head.py @@ -0,0 +1,828 @@ +import copy +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from mmcv.cnn import ConvModule, build_conv_layer +from mmdet.models.task_modules import (AssignResult, PseudoSampler, + build_assigner, build_bbox_coder, + build_sampler) +from mmdet.models.utils import multi_apply +from mmdet3d.registry import MODELS +from mmdet3d.models.utils import clip_sigmoid, gaussian_radius, 
draw_heatmap_gaussian +from mmdet3d.structures import xywhr2xyxyr, LiDARInstance3DBoxes +from mmdet3d.models.dense_heads.centerpoint_head import SeparateHead +from mmdet3d.models.layers.fusion_layers import apply_3d_transformation +from mmdet3d.models import circle_nms, nms_bev +from projects.BEVFusion.bevfusion.transformer import PositionEncodingLearned + + +class TransFusion_TransformerDecoderLayer(nn.Module): + # The decoder is same with bevfusion. It is a bit confusing in the qkv +query_pos part but the query_pos is added in mha for the bevfusion version + # MultiheadAttention is replaced with nn.MultiheadAttention to reduce maintenance + # FFN (SeparateHead) is from https://github.com/open-mmlab/mmdetection3d/blob/fe25f7a51d36e3702f961e198894580d83c4387b/mmdet3d/models/dense_heads/centerpoint_head.py#L20 + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", + self_posembed=None, cross_posembed=None, cross_only=False): + super().__init__() + self.cross_only = cross_only + if not self.cross_only: + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + self.activation = _get_activation_fn(activation) + + self.self_posembed = 
self_posembed + self.cross_posembed = cross_posembed + + def with_pos_embed(self, tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, query, key, query_pos, key_pos, attn_mask=None): + """Apply (optional) self-attention, cross-attention and the FFN to the queries. + + :param query: B C Pq + :param key: B C Pk + :param query_pos: B Pq 3/6 + :param key_pos: B Pk 3/6 + :param attn_mask: optional mask forwarded to the cross-attention + :return: updated query, permuted back to B C Pq + """ + # NxCxP to PxNxC + if self.self_posembed is not None: + query_pos_embed = self.self_posembed(query_pos).permute(2, 0, 1) + else: + query_pos_embed = None + if self.cross_posembed is not None: + key_pos_embed = self.cross_posembed(key_pos).permute(2, 0, 1) + else: + key_pos_embed = None + + query = query.permute(2, 0, 1) + key = key.permute(2, 0, 1) + + if not self.cross_only: + q = k = v = self.with_pos_embed(query, query_pos_embed) + query2 = self.self_attn(q, k, value=v)[0] + query = query + self.dropout1(query2) + query = self.norm1(query) + + query2 = self.multihead_attn(query=self.with_pos_embed(query, query_pos_embed), + key=self.with_pos_embed(key, key_pos_embed), + value=self.with_pos_embed(key, key_pos_embed), attn_mask=attn_mask)[0] + query = query + self.dropout2(query2) + query = self.norm2(query) + + query2 = self.linear2(self.dropout(self.activation(self.linear1(query)))) + query = query + self.dropout3(query2) + query = self.norm3(query) + + # PxNxC back to NxCxP + query = query.permute(1, 2, 0) + return query + + +@MODELS.register_module() +class TransFusion_TransFusionHead(BaseModule): + # The TransFusion_ prefix avoids a name conflict with BEVFusion, which this project also imports.
+ # One difference between BEVFusion and Transfusion is theat the fusion for Transfusion is done in the head + def __init__(self, + fuse_img=False, + num_views=0, + in_channels_img=64, + out_size_factor_img=4, + learnable_query_pos=False, + initialize_by_heatmap=False, + num_proposals=128, + auxiliary=True, + in_channels=128 * 3, + hidden_channel=128, + num_classes=4, + # config for Transformer + num_decoder_layers=3, + num_heads=8, + ffn_channel=256, + dropout=0.1, + activation='relu', + nms_kernel_size=1, + bn_momentum=0.1, + # config for FFN + common_heads=dict(), + num_heatmap_convs=2, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + bias='auto', + # loss + loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='mmdet.L1Loss', reduction='mean'), + loss_heatmap=dict(type='mmdet.GaussianFocalLoss', reduction='mean'), + # others + train_cfg=None, + test_cfg=None, + bbox_coder=None, + init_cfg=None): + super(TransFusion_TransFusionHead, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.num_proposals = num_proposals + self.auxiliary = auxiliary + self.in_channels = in_channels + self.num_decoder_layers = num_decoder_layers + self.bn_momentum = bn_momentum + self.learnable_query_pos = learnable_query_pos + self.initialize_by_heatmap = initialize_by_heatmap + self.nms_kernel_size = nms_kernel_size + if self.initialize_by_heatmap is True: + assert self.learnable_query_pos is False, "initialized by heatmap is conflicting with learnable query position" + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_heatmap = MODELS.build(loss_heatmap) + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.sampling = False + + # a shared convolution + self.shared_conv = build_conv_layer( + dict(type='Conv2d'), in_channels, hidden_channel, kernel_size=3, padding=1, bias=bias) + + if self.initialize_by_heatmap: 
+ layers = [] + layers.append(ConvModule(hidden_channel, hidden_channel, kernel_size=3, padding=1, bias=bias, + conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'))) + layers.append(build_conv_layer(dict(type='Conv2d'), hidden_channel, num_classes, kernel_size=3, padding=1, bias=bias)) + self.heatmap_head = nn.Sequential(*layers) + self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1) + else: + # query feature + self.query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_proposals)) + self.query_pos = nn.Parameter(torch.rand([1, self.num_proposals, 2]), requires_grad=learnable_query_pos) + + # transformer decoder layers for object query with LiDAR feature + self.decoder = nn.ModuleList() + for i in range(self.num_decoder_layers): + self.decoder.append( + TransFusion_TransformerDecoderLayer( + hidden_channel, num_heads, ffn_channel, dropout, activation, + self_posembed=PositionEncodingLearned(2, hidden_channel), + cross_posembed=PositionEncodingLearned(2, hidden_channel), + )) + + # Prediction Head + self.prediction_heads = nn.ModuleList() + for i in range(self.num_decoder_layers): + heads = copy.deepcopy(common_heads) + heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs))) + self.prediction_heads.append(SeparateHead(hidden_channel, heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)) + + self.fuse_img = fuse_img + if self.fuse_img: + self.num_views = num_views + self.out_size_factor_img = out_size_factor_img + self.shared_conv_img = build_conv_layer(dict(type='Conv2d'), in_channels_img, hidden_channel, kernel_size=3, padding=1, bias=bias) + self.heatmap_head_img = copy.deepcopy(self.heatmap_head) + + # transformer decoder layers for img fusion + self.decoder.append( + TransFusion_TransformerDecoderLayer( + hidden_channel, num_heads, ffn_channel, dropout, activation, + self_posembed=PositionEncodingLearned(2, hidden_channel), + cross_posembed=PositionEncodingLearned(2, hidden_channel), + )) + # cross-attention only layers 
for projecting img feature onto BEV + for i in range(num_views): + self.decoder.append( + TransFusion_TransformerDecoderLayer( + hidden_channel, num_heads, ffn_channel, dropout, activation, + self_posembed=PositionEncodingLearned(2, hidden_channel), + cross_posembed=PositionEncodingLearned(2, hidden_channel), + cross_only=True, + )) + self.fc = nn.Sequential(*[nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)]) + + heads = copy.deepcopy(common_heads) + heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs))) + self.prediction_heads.append(SeparateHead(hidden_channel * 2, heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)) + + self.init_weights() + self._init_assigner_sampler() + + # Position Embedding for Cross-Attention, which is re-used during training + x_size = self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'] + y_size = self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor'] + self.bev_pos = self.create_2D_grid(x_size, y_size) + + self.img_feat_pos = None + self.img_feat_collapsed_pos = None + + def create_2D_grid(self, x_size, y_size): + meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]] + batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) + coord_base = torch.cat([batch_x[None] + 0.5, batch_y[None] + 0.5], dim=0)[None] + return coord_base.view(1, 2, -1).permute(0, 2, 1) + + def init_weights(self): + # initialize transformer + for m in self.decoder.parameters(): + if m.dim() > 1: + nn.init.xavier_uniform_(m) + if hasattr(self, 'query'): + nn.init.xavier_normal_(self.query) + self.init_bn_momentum() + + def init_bn_momentum(self): + for m in self.modules(): + if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)): + m.momentum = self.bn_momentum + + def _init_assigner_sampler(self): + """Initialize the target assigner and sampler of the head.""" + if self.train_cfg is None: return + self.bbox_sampler = PseudoSampler() + self.bbox_assigner = 
build_assigner(self.train_cfg.assigner) + + def forward(self, feats, batch_input_metas): + """Forward pass. Takes (pts_feats, img_feats) from TransFusionDetector.""" + pts_feats, img_feats = feats + if img_feats is None: + img_feats = [None] + res = multi_apply(self.forward_single, pts_feats, img_feats, [batch_input_metas]) + return res + + def forward_single(self, lidar_feat_in, img_inputs, img_metas): + batch_size = lidar_feat_in.shape[0] + lidar_feat = self.shared_conv(lidar_feat_in) + lidar_feat_flatten = lidar_feat.view(batch_size, lidar_feat.shape[1], -1) # [BS, C, H*W] + bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) + + ################################# + # image to BEV + ################################# + if self.fuse_img: + img_feat = self.shared_conv_img(img_inputs) # [BS * n_views, C, H, W] + + num_channel = img_feat.shape[1] + if img_feat.dim() == 5: + # [BS, n_views, C, H, W] -> flatten view + img_feat = img_feat.flatten(0, 1) + + img_h, img_w = img_inputs.shape[-2], img_inputs.shape[-1] + # Reshape back to [BS, n_views, C, H, W] + raw_img_feat = img_feat.view(batch_size, self.num_views, num_channel, img_feat.shape[-2], img_feat.shape[-1]).permute(0, 2, 3, 1, 4) + + # Collapse height + img_feat_collapsed = raw_img_feat.reshape(batch_size, num_channel, img_feat.shape[-2], -1).max(2).values + img_feat_collapsed = self.fc(img_feat_collapsed) + + # positional encoding for image guided query initialization + if self.img_feat_collapsed_pos is None: + img_feat_collapsed_pos = self.img_feat_collapsed_pos = self.create_2D_grid(1, img_feat_collapsed.shape[-1]).to(img_feat.device) + else: + img_feat_collapsed_pos = self.img_feat_collapsed_pos + + bev_feat = lidar_feat_flatten + for idx_view in range(self.num_views): + bev_feat = self.decoder[self.num_decoder_layers + 1 + idx_view]( + bev_feat, + img_feat_collapsed[..., img_w * idx_view:img_w * (idx_view + 1)], + bev_pos, + img_feat_collapsed_pos[:, img_w * idx_view:img_w * (idx_view + 1)] 
+ ) + + ################################# + # image guided query initialization + ################################# + if self.initialize_by_heatmap: + with torch.autocast('cuda', enabled=False): + dense_heatmap = self.heatmap_head(lidar_feat.float()) + dense_heatmap_img = None + if self.fuse_img: + with torch.autocast('cuda', enabled=False): + dense_heatmap_img = self.heatmap_head_img(bev_feat.view(lidar_feat.shape).float()) + heatmap = (dense_heatmap.detach().sigmoid() + dense_heatmap_img.detach().sigmoid()) / 2 + else: + heatmap = dense_heatmap.detach().sigmoid() + + padding = self.nms_kernel_size // 2 + local_max = torch.zeros_like(heatmap) + local_max_inner = F.max_pool2d(heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0) + local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner + + ## for Pedestrian & Traffic_cone in nuScenes + if self.test_cfg['dataset'] == 'nuScenes': + local_max[:, 8, ] = F.max_pool2d(heatmap[:, 8], kernel_size=1, stride=1, padding=0) + local_max[:, 9, ] = F.max_pool2d(heatmap[:, 9], kernel_size=1, stride=1, padding=0) + elif self.test_cfg['dataset'] == 'Waymo': + local_max[:, 1, ] = F.max_pool2d(heatmap[:, 1], kernel_size=1, stride=1, padding=0) + local_max[:, 2, ] = F.max_pool2d(heatmap[:, 2], kernel_size=1, stride=1, padding=0) + + heatmap = heatmap * (heatmap == local_max) + heatmap = heatmap.view(batch_size, heatmap.shape[1], -1) + + # top #num_proposals among all classes + top_proposals = heatmap.view(batch_size, -1).topk(self.num_proposals, dim=-1).indices + top_proposals_class = top_proposals // heatmap.shape[-1] + top_proposals_index = top_proposals % heatmap.shape[-1] + + query_feat = lidar_feat_flatten.gather(index=top_proposals_index[:, None, :].expand(-1, lidar_feat_flatten.shape[1], -1), dim=-1) + self.query_labels = top_proposals_class + + # add category embedding + one_hot = F.one_hot(top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1) + query_cat_encoding = 
self.class_encoding(one_hot.float()) + query_feat += query_cat_encoding + + query_pos = bev_pos.gather(index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), dim=1) + else: + query_feat = self.query_feat.repeat(batch_size, 1, 1) # [BS, C, num_proposals] + query_pos = self.query_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) # [BS, num_proposals, 2] + + ################################# + # transformer decoder layer (LiDAR feature as K,V) + ################################# + ret_dicts = [] + for i in range(self.num_decoder_layers): + + query_feat = self.decoder[i](query_feat, lidar_feat_flatten, query_pos, bev_pos) + + res_layer = self.prediction_heads[i](query_feat) + res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1) + first_res_layer = res_layer + + if not self.fuse_img: + ret_dicts.append(res_layer) + + # for next level positional embedding + query_pos = res_layer['center'].detach().clone().permute(0, 2, 1) + + ################################# + # transformer decoder layer (img feature as K,V) + ################################# + if self.fuse_img: + # positional encoding for image fusion + img_feat = raw_img_feat.permute(0, 3, 1, 2, 4) # [BS, n_views, C, H, W] + img_feat_flatten = img_feat.view(batch_size, self.num_views, num_channel, -1) # [BS, n_views, C, H*W] + + if self.img_feat_pos is None: + (h, w) = img_inputs.shape[-2], img_inputs.shape[-1] + img_feat_pos = self.img_feat_pos = self.create_2D_grid(h, w).to(img_feat_flatten.device) + else: + img_feat_pos = self.img_feat_pos + + prev_query_feat = query_feat.detach().clone() + query_feat = torch.zeros_like(query_feat) # create new container for img query feature + + # Prepare 3D Query Positions + query_pos_realmetric = query_pos.permute(0, 2, 1) * self.test_cfg['out_size_factor'] * self.test_cfg['voxel_size'][0] + self.test_cfg['pc_range'][0] + query_pos_3d = torch.cat([query_pos_realmetric, res_layer['height']], dim=1).detach().clone() + + if 
'vel' in res_layer: + vel = copy.deepcopy(res_layer['vel'].detach()) + else: + vel = None + + pred_boxes = self.bbox_coder.decode( + res_layer['heatmap'].detach().clone(), + res_layer['rot'].detach().clone(), + res_layer['dim'].detach().clone(), + res_layer['center'].detach().clone(), + res_layer['height'].detach().clone(), + vel, + ) + + on_the_image_mask = torch.ones([batch_size, self.num_proposals]).to(query_pos_3d.device) * -1 + + # Fusion Loop + for sample_idx in range(batch_size): + #print(img_metas[sample_idx].keys()) + lidar2img_rt = query_pos_3d.new_tensor(img_metas[sample_idx]['lidar2img']) + img_scale_factor = ( + query_pos_3d.new_tensor(img_metas[sample_idx]['scale_factor'][:2] + if 'scale_factor' in img_metas[sample_idx].keys() else [1.0, 1.0])) + img_flip = img_metas[sample_idx]['flip'] if 'flip' in img_metas[sample_idx].keys() else False + img_crop_offset = ( + query_pos_3d.new_tensor(img_metas[sample_idx]['img_crop_offset']) + if 'img_crop_offset' in img_metas[sample_idx].keys() else 0) + img_shape = img_metas[sample_idx]['img_shape'][:2] + img_pad_shape = img_metas[sample_idx]['input_shape'][:2] + + # Use Box3DMode (or manual instantiation) to get corners + # This preserves the active fusion geometry + current_bboxes = pred_boxes[sample_idx]['bboxes'] + if current_bboxes.shape[0] == 0: + boxes_corners = torch.zeros(0, 8, 3).to(current_bboxes.device) + else: + # Perhaps not the best way + # LiDARInstance3DBoxes is available from imports + boxes = LiDARInstance3DBoxes(current_bboxes[:, :7], box_dim=7) + boxes_corners = boxes.corners + + # [N, 8, 3] -> [3, N*8] + corners_3d = boxes_corners.permute(2, 0, 1).view(3, -1) + query_pos_3d_with_corners = torch.cat([query_pos_3d[sample_idx], corners_3d], dim=-1) + + if batch_size == 1: + points = query_pos_3d_with_corners.T + else: + points = apply_3d_transformation(query_pos_3d_with_corners.T, 'LIDAR', img_metas[sample_idx], reverse=True).detach() + + num_points = points.shape[0] + + for view_idx in 
range(self.num_views): + pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1) + pts_2d = pts_4d @ lidar2img_rt[view_idx].t() + + pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + + img_coors = pts_2d[:, 0:2] * img_scale_factor + img_coors -= img_crop_offset + coor_x, coor_y = torch.split(img_coors, 1, dim=1) + + if img_flip: + orig_h, orig_w = img_shape + coor_x = orig_w - coor_x + + coor_x, coor_corner_x = coor_x[0:self.num_proposals, :], coor_x[self.num_proposals:, :] + coor_y, coor_corner_y = coor_y[0:self.num_proposals, :], coor_y[self.num_proposals:, :] + + coor_corner_x = coor_corner_x.reshape(self.num_proposals, 8, 1) + coor_corner_y = coor_corner_y.reshape(self.num_proposals, 8, 1) + coor_corner_xy = torch.cat([coor_corner_x, coor_corner_y], dim=-1) + + h, w = img_pad_shape + on_the_image = (coor_x > 0) * (coor_x < w) * (coor_y > 0) * (coor_y < h) + on_the_image = on_the_image.squeeze() + + if on_the_image.sum() <= 1: + continue + on_the_image_mask[sample_idx, on_the_image] = view_idx + + # Spatial Modulation (SMCA) + center_ys = (coor_y[on_the_image] / self.out_size_factor_img) + center_xs = (coor_x[on_the_image] / self.out_size_factor_img) + centers = torch.cat([center_xs, center_ys], dim=-1).int() + + corners = (coor_corner_xy[on_the_image].max(1).values - coor_corner_xy[on_the_image].min(1).values) / self.out_size_factor_img + radius = torch.ceil(corners.norm(dim=-1, p=2) / 2).int() + sigma = (radius * 2 + 1) / 6.0 + + distance = (centers[:, None, :] - (img_feat_pos - 0.5)).norm(dim=-1) ** 2 + gaussian_mask = (-distance / (2 * sigma[:, None] ** 2)).exp() + gaussian_mask[gaussian_mask < torch.finfo(torch.float32).eps] = 0 + attn_mask = gaussian_mask + + query_feat_view = prev_query_feat[sample_idx, :, on_the_image] + query_pos_view = torch.cat([center_xs, center_ys], dim=-1) + + query_feat_view = self.decoder[self.num_decoder_layers]( + query_feat_view[None], + 
img_feat_flatten[sample_idx:sample_idx + 1, view_idx], + query_pos_view[None], + img_feat_pos, + attn_mask=attn_mask.log() + ) + query_feat[sample_idx, :, on_the_image] = query_feat_view.clone() + + self.on_the_image_mask = (on_the_image_mask != -1) + res_layer = self.prediction_heads[self.num_decoder_layers](torch.cat([query_feat, prev_query_feat], dim=1)) + res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1) + + # Mask out invalid predictions + for key, value in res_layer.items(): + pred_dim = value.shape[1] + mask = ~self.on_the_image_mask.unsqueeze(1).expand(-1, pred_dim, -1) + res_layer[key][mask] = first_res_layer[key][mask] + ret_dicts.append(res_layer) + + if self.initialize_by_heatmap: + ret_dicts[0]['query_heatmap_score'] = heatmap.gather(index=top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1) + if self.fuse_img: + ret_dicts[0]['dense_heatmap'] = dense_heatmap_img + else: + ret_dicts[0]['dense_heatmap'] = dense_heatmap + + if self.auxiliary is False: + return [ret_dicts[-1]] + + new_res = {} + for key in ret_dicts[0].keys(): + if key not in ['dense_heatmap', 'dense_heatmap_old', 'query_heatmap_score']: + new_res[key] = torch.cat([ret_dict[key] for ret_dict in ret_dicts], dim=-1) + else: + new_res[key] = ret_dicts[0][key] + return [new_res] + + def loss(self, feats, batch_data_samples, **kwargs): + """Modern loss entry point.""" + batch_input_metas = [d.metainfo for d in batch_data_samples] + batch_gt_instances_3d = [d.gt_instances_3d for d in batch_data_samples] + + preds_dicts = self(feats, batch_input_metas) + loss = self.loss_by_feat(preds_dicts, batch_gt_instances_3d) + return loss + + def loss_by_feat(self, preds_dicts, batch_gt_instances_3d): + """Calculate loss based on features.""" + # Fix: Dynamic unpacking based on initialization mode + targets = self.get_targets(batch_gt_instances_3d, preds_dicts[0]) + + if self.initialize_by_heatmap: + (labels, label_weights, bbox_targets, bbox_weights, + ious, 
num_pos, matched_ious, heatmap) = targets + else: + (labels, label_weights, bbox_targets, bbox_weights, + ious, num_pos, matched_ious) = targets + heatmap = None + + if hasattr(self, 'on_the_image_mask'): + label_weights = label_weights * self.on_the_image_mask + bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None] + num_pos = bbox_weights.max(-1).values.sum() + + preds_dict = preds_dicts[0][0] + loss_dict = dict() + + if self.initialize_by_heatmap and heatmap is not None: + loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict['dense_heatmap']), heatmap, avg_factor=max(heatmap.eq(1).float().sum().item(), 1)) + loss_dict['loss_heatmap'] = loss_heatmap + + # Compute loss for each layer + for idx_layer in range(self.num_decoder_layers if self.auxiliary else 1): + if idx_layer == self.num_decoder_layers - 1 or (idx_layer == 0 and self.auxiliary is False): + prefix = 'layer_-1' + else: + prefix = f'layer_{idx_layer}' + + layer_labels = labels[..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals].reshape(-1) + layer_label_weights = label_weights[..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals].reshape(-1) + layer_score = preds_dict['heatmap'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + layer_cls_score = layer_score.permute(0, 2, 1).reshape(-1, self.num_classes) + + layer_loss_cls = self.loss_cls(layer_cls_score, layer_labels, layer_label_weights, avg_factor=max(num_pos, 1)) + + layer_center = preds_dict['center'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + layer_height = preds_dict['height'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + layer_rot = preds_dict['rot'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + layer_dim = preds_dict['dim'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot], 
dim=1).permute(0, 2, 1) + + if 'vel' in preds_dict.keys(): + layer_vel = preds_dict['vel'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals] + preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot, layer_vel], dim=1).permute(0, 2, 1) + + code_weights = self.train_cfg.get('code_weights', None) + layer_bbox_weights = bbox_weights[:, idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals, :] + layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(code_weights) + layer_bbox_targets = bbox_targets[:, idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals, :] + layer_loss_bbox = self.loss_bbox(preds, layer_bbox_targets, layer_reg_weights, avg_factor=max(num_pos, 1)) + + loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls + loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox + + loss_dict[f'matched_ious'] = layer_loss_cls.new_tensor(matched_ious) + return loss_dict + + def get_targets(self, batch_gt_instances_3d, preds_dict): + """Generate training targets using InstanceData.""" + # Convert preds_dict to list of dicts for multi_apply + list_of_pred_dict = [] + for batch_idx in range(len(batch_gt_instances_3d)): + pred_dict = {} + for key in preds_dict[0].keys(): + pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1] + list_of_pred_dict.append(pred_dict) + + res_tuple = multi_apply(self.get_targets_single, batch_gt_instances_3d, list_of_pred_dict, np.arange(len(batch_gt_instances_3d))) + + labels = torch.cat(res_tuple[0], dim=0) + label_weights = torch.cat(res_tuple[1], dim=0) + bbox_targets = torch.cat(res_tuple[2], dim=0) + bbox_weights = torch.cat(res_tuple[3], dim=0) + ious = torch.cat(res_tuple[4], dim=0) + num_pos = np.sum(res_tuple[5]) + matched_ious = np.mean(res_tuple[6]) + + if self.initialize_by_heatmap: + heatmap = torch.cat(res_tuple[7], dim=0) + return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious, heatmap + else: + return labels, 
label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious + + def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx): + """Generate training targets for a single sample.""" + # Unpack InstanceData + gt_bboxes_3d = gt_instances_3d.bboxes_3d + gt_labels_3d = gt_instances_3d.labels_3d + num_proposals = preds_dict['center'].shape[-1] + + score = copy.deepcopy(preds_dict['heatmap'].detach()) + center = copy.deepcopy(preds_dict['center'].detach()) + height = copy.deepcopy(preds_dict['height'].detach()) + dim = copy.deepcopy(preds_dict['dim'].detach()) + rot = copy.deepcopy(preds_dict['rot'].detach()) + vel = copy.deepcopy(preds_dict['vel'].detach()) if 'vel' in preds_dict.keys() else None + + boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) + bboxes_tensor = boxes_dict[0]['bboxes'] + gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device) + + if self.auxiliary: + num_layer = self.num_decoder_layers + else: + num_layer = 1 + + assign_result_list = [] + for idx_layer in range(num_layer): + bboxes_tensor_layer = bboxes_tensor[self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1), :] + score_layer = score[..., self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1)] + + if self.train_cfg.assigner.type == 'HungarianAssigner3D': + assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, gt_labels_3d, score_layer, self.train_cfg) + elif self.train_cfg.assigner.type == 'HeuristicAssigner': + assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, None, gt_labels_3d, self.query_labels[batch_idx]) + else: + raise NotImplementedError + assign_result_list.append(assign_result) + + assign_result_ensemble = AssignResult( + num_gts=sum([res.num_gts for res in assign_result_list]), + gt_inds=torch.cat([res.gt_inds for res in assign_result_list]), + max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]), + labels=torch.cat([res.labels for res in 
assign_result_list]), + ) + + # Sampling (Use PseudoSampler) + gt_instances = InstanceData(bboxes=gt_bboxes_tensor) + pred_instances = InstanceData(priors=bboxes_tensor) + sampling_result = self.bbox_sampler.sample(assign_result_ensemble, pred_instances, gt_instances) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # Create target for loss computation + bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device) + bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device) + ious = assign_result_ensemble.max_overlaps + ious = torch.clamp(ious, min=0.0, max=1.0) + labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long) + label_weights = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long) + + if gt_labels_3d is not None: + labels += self.num_classes + + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_gt_bboxes) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + if gt_labels_3d is None: + labels[pos_inds] = 1 + else: + labels[pos_inds] = gt_labels_3d[sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + if self.initialize_by_heatmap: + device = labels.device + gt_bboxes_3d_tensor = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device) + grid_size = torch.tensor(self.train_cfg['grid_size']) + pc_range = torch.tensor(self.train_cfg['point_cloud_range']) + voxel_size = torch.tensor(self.train_cfg['voxel_size']) + feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] + heatmap = gt_bboxes_3d.tensor.new_zeros(self.num_classes, feature_map_size[1], feature_map_size[0]) + + for idx in range(len(gt_bboxes_3d)): + width = gt_bboxes_3d_tensor[idx][3] + length = 
gt_bboxes_3d_tensor[idx][4] + width = width / voxel_size[0] / self.train_cfg['out_size_factor'] + length = length / voxel_size[1] / self.train_cfg['out_size_factor'] + if width > 0 and length > 0: + radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap']) + radius = max(self.train_cfg['min_radius'], int(radius)) + x, y = gt_bboxes_3d_tensor[idx][0], gt_bboxes_3d_tensor[idx][1] + + coor_x = (x - pc_range[0]) / voxel_size[0] / self.train_cfg['out_size_factor'] + coor_y = (y - pc_range[1]) / voxel_size[1] / self.train_cfg['out_size_factor'] + + center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device) + center_int = center.to(torch.int32) + draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]], center_int, radius) + + mean_iou = ious[pos_inds].sum() / max(len(pos_inds), 1) + return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], int(pos_inds.shape[0]), float(mean_iou), heatmap[None] + else: + mean_iou = ious[pos_inds].sum() / max(len(pos_inds), 1) + return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], int(pos_inds.shape[0]), float(mean_iou) + + def predict(self, feats, batch_input_metas, **kwargs): + """Modern predict entry point.""" + preds_dicts = self(feats, batch_input_metas) + return self.predict_by_feat(preds_dicts, batch_input_metas) + + def predict_by_feat(self, preds_dicts, metas, img=None, rescale=False, for_roi=False): + """Generate bboxes from bbox head predictions.""" + rets = [] + for layer_id, preds_dict in enumerate(preds_dicts): + batch_size = preds_dict[0]['heatmap'].shape[0] + batch_score = preds_dict[0]['heatmap'][..., -self.num_proposals:].sigmoid() + + one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1) + batch_score = batch_score * preds_dict[0]['query_heatmap_score'] * one_hot + + batch_center = preds_dict[0]['center'][..., -self.num_proposals:] + batch_height = 
preds_dict[0]['height'][..., -self.num_proposals:] + batch_dim = preds_dict[0]['dim'][..., -self.num_proposals:] + batch_rot = preds_dict[0]['rot'][..., -self.num_proposals:] + batch_vel = None + if 'vel' in preds_dict[0]: + batch_vel = preds_dict[0]['vel'][..., -self.num_proposals:] + + temp = self.bbox_coder.decode(batch_score, batch_rot, batch_dim, batch_center, batch_height, batch_vel, filter=True) + + if self.test_cfg['dataset'] == 'nuScenes': + self.tasks = [ + dict(num_class=8, class_names=[], indices=[0, 1, 2, 3, 4, 5, 6, 7], radius=-1), + dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.175), + dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.175), + ] + elif self.test_cfg['dataset'] == 'Waymo': + self.tasks = [ + dict(num_class=1, class_names=['Car'], indices=[0], radius=0.7), + dict(num_class=1, class_names=['Pedestrian'], indices=[1], radius=0.7), + dict(num_class=1, class_names=['Cyclist'], indices=[2], radius=0.7), + ] + + ret_layer = [] + for i in range(batch_size): + boxes3d = temp[i]['bboxes'] + scores = temp[i]['scores'] + labels = temp[i]['labels'] + + # NMS Logic + if self.test_cfg['nms_type'] is not None: + keep_mask = torch.zeros_like(scores) + for task in self.tasks: + task_mask = torch.zeros_like(scores) + for cls_idx in task['indices']: + task_mask += labels == cls_idx + task_mask = task_mask.bool() + if task['radius'] > 0: + if self.test_cfg['nms_type'] == 'circle': + boxes_for_nms = torch.cat([boxes3d[task_mask][:, :2], scores[:, None][task_mask]], dim=1) + task_keep_indices = torch.tensor(circle_nms(boxes_for_nms.detach().cpu().numpy(), task['radius'])) + else: + boxes_for_nms = xywhr2xyxyr(metas[i]['box_type_3d'](boxes3d[task_mask][:, :7], 7).bev) + top_scores = scores[task_mask] + task_keep_indices = nms_bev(boxes_for_nms, top_scores, thresh=task['radius'], + pre_maxsize=self.test_cfg['pre_maxsize'], + post_max_size=self.test_cfg['post_maxsize']) + else: + task_keep_indices = 
torch.arange(task_mask.sum()) + if task_keep_indices.shape[0] != 0: + keep_indices = torch.where(task_mask != 0)[0][task_keep_indices] + keep_mask[keep_indices] = 1 + keep_mask = keep_mask.bool() + ret = dict(bboxes=boxes3d[keep_mask], scores=scores[keep_mask], labels=labels[keep_mask]) + else: + ret = dict(bboxes=boxes3d, scores=scores, labels=labels) + + # Wrap in InstanceData + inst = InstanceData() + inst.bboxes_3d = metas[0]['box_type_3d'](ret['bboxes'], box_dim=ret['bboxes'].shape[-1]) + inst.scores_3d = ret['scores'] + inst.labels_3d = ret['labels'].int() + ret_layer.append(inst) + rets.append(ret_layer) + + return rets[0] \ No newline at end of file