From a119f5bbaaf10079c85450aea1d2980ca0283e41 Mon Sep 17 00:00:00 2001 From: xinyu Date: Mon, 24 Oct 2022 11:47:25 +0800 Subject: [PATCH 01/20] add data preparer --- .../_base_/datasets/wildreceipt-openset.py | 2 +- configs/kie/_base_/datasets/wildreceipt.py | 2 +- configs/textdet/_base_/datasets/icdar2015.py | 6 +- configs/textdet/_base_/datasets/totaltext.py | 17 + .../textrecog/_base_/datasets/icdar2015.py | 8 +- .../textrecog/_base_/datasets/totaltext.py | 17 + dataset_zoo/icdar2015/metafile.yml | 28 + dataset_zoo/icdar2015/textdet.py | 51 ++ dataset_zoo/icdar2015/textrecog.py | 42 ++ dataset_zoo/icdar2015/textspotting.py | 3 + dataset_zoo/totaltext/metafile.yml | 29 + dataset_zoo/totaltext/textdet.py | 39 ++ dataset_zoo/totaltext/textrecog.py | 3 + dataset_zoo/totaltext/textspotting.py | 3 + dataset_zoo/wildreceipt/kie.py | 32 + dataset_zoo/wildreceipt/metafile.yml | 29 + dataset_zoo/wildreceipt/textdet.py | 6 + dataset_zoo/wildreceipt/textrecog.py | 3 + dataset_zoo/wildreceipt/textspotting.py | 3 + docs/en/index.rst | 1 + .../data_prepare/dataset_preparer.md | 39 ++ docs/en/user_guides/data_prepare/det.md | 6 +- docs/en/user_guides/data_prepare/kie.md | 6 +- docs/en/user_guides/data_prepare/recog.md | 6 +- docs/zh_cn/index.rst | 1 + .../data_prepare/dataset_preparer.md | 39 ++ docs/zh_cn/user_guides/data_prepare/det.md | 6 +- docs/zh_cn/user_guides/data_prepare/kie.md | 6 +- docs/zh_cn/user_guides/data_prepare/recog.md | 4 +- mmocr/datasets/preparers/__init__.py | 13 + mmocr/datasets/preparers/data_converter.py | 629 ++++++++++++++++++ mmocr/datasets/preparers/data_obtainer.py | 148 +++++ mmocr/datasets/preparers/data_preparer.py | 111 ++++ mmocr/datasets/preparers/dumpers/__init__.py | 4 + mmocr/datasets/preparers/dumpers/dumpers.py | 30 + mmocr/datasets/preparers/parsers/__init__.py | 9 + mmocr/datasets/preparers/parsers/base.py | 37 ++ .../datasets/preparers/parsers/ic15_parser.py | 73 ++ mmocr/datasets/preparers/parsers/loaders.py | 93 +++ 
.../preparers/parsers/totaltext_parser.py | 34 + .../datasets/preparers/parsers/wildreceipt.py | 80 +++ mmocr/utils/__init__.py | 6 +- mmocr/utils/fileio.py | 73 +- tools/dataset_converters/prepare_dataset.py | 49 ++ 44 files changed, 1806 insertions(+), 20 deletions(-) create mode 100644 configs/textdet/_base_/datasets/totaltext.py create mode 100644 configs/textrecog/_base_/datasets/totaltext.py create mode 100644 dataset_zoo/icdar2015/metafile.yml create mode 100644 dataset_zoo/icdar2015/textdet.py create mode 100644 dataset_zoo/icdar2015/textrecog.py create mode 100644 dataset_zoo/icdar2015/textspotting.py create mode 100644 dataset_zoo/totaltext/metafile.yml create mode 100644 dataset_zoo/totaltext/textdet.py create mode 100644 dataset_zoo/totaltext/textrecog.py create mode 100644 dataset_zoo/totaltext/textspotting.py create mode 100644 dataset_zoo/wildreceipt/kie.py create mode 100644 dataset_zoo/wildreceipt/metafile.yml create mode 100644 dataset_zoo/wildreceipt/textdet.py create mode 100644 dataset_zoo/wildreceipt/textrecog.py create mode 100644 dataset_zoo/wildreceipt/textspotting.py create mode 100644 docs/en/user_guides/data_prepare/dataset_preparer.md create mode 100644 docs/zh_cn/user_guides/data_prepare/dataset_preparer.md create mode 100644 mmocr/datasets/preparers/__init__.py create mode 100644 mmocr/datasets/preparers/data_converter.py create mode 100644 mmocr/datasets/preparers/data_obtainer.py create mode 100644 mmocr/datasets/preparers/data_preparer.py create mode 100644 mmocr/datasets/preparers/dumpers/__init__.py create mode 100644 mmocr/datasets/preparers/dumpers/dumpers.py create mode 100644 mmocr/datasets/preparers/parsers/__init__.py create mode 100644 mmocr/datasets/preparers/parsers/base.py create mode 100644 mmocr/datasets/preparers/parsers/ic15_parser.py create mode 100644 mmocr/datasets/preparers/parsers/loaders.py create mode 100644 mmocr/datasets/preparers/parsers/totaltext_parser.py create mode 100644 
mmocr/datasets/preparers/parsers/wildreceipt.py create mode 100644 tools/dataset_converters/prepare_dataset.py diff --git a/configs/kie/_base_/datasets/wildreceipt-openset.py b/configs/kie/_base_/datasets/wildreceipt-openset.py index 33274a7fb..f82512839 100644 --- a/configs/kie/_base_/datasets/wildreceipt-openset.py +++ b/configs/kie/_base_/datasets/wildreceipt-openset.py @@ -1,4 +1,4 @@ -wildreceipt_openset_data_root = 'data/kie/wildreceipt/' +wildreceipt_openset_data_root = 'data/wildreceipt/' wildreceipt_openset_train = dict( type='WildReceiptDataset', diff --git a/configs/kie/_base_/datasets/wildreceipt.py b/configs/kie/_base_/datasets/wildreceipt.py index b266c2e0f..9c1122edd 100644 --- a/configs/kie/_base_/datasets/wildreceipt.py +++ b/configs/kie/_base_/datasets/wildreceipt.py @@ -1,4 +1,4 @@ -wildreceipt_data_root = 'data/kie/wildreceipt/' +wildreceipt_data_root = 'data/wildreceipt/' wildreceipt_train = dict( type='WildReceiptDataset', diff --git a/configs/textdet/_base_/datasets/icdar2015.py b/configs/textdet/_base_/datasets/icdar2015.py index 6553d76a2..a14cb383a 100644 --- a/configs/textdet/_base_/datasets/icdar2015.py +++ b/configs/textdet/_base_/datasets/icdar2015.py @@ -1,9 +1,9 @@ -ic15_det_data_root = 'data/det/icdar2015' +ic15_det_data_root = 'data/icdar2015' ic15_det_train = dict( type='OCRDataset', data_root=ic15_det_data_root, - ann_file='instances_training.json', + ann_file='textdet_train.json', data_prefix=dict(img_path='imgs/'), filter_cfg=dict(filter_empty_gt=True, min_size=32), pipeline=None) @@ -11,7 +11,7 @@ ic15_det_test = dict( type='OCRDataset', data_root=ic15_det_data_root, - ann_file='instances_test.json', + ann_file='textdet_test.json', data_prefix=dict(img_path='imgs/'), test_mode=True, pipeline=None) diff --git a/configs/textdet/_base_/datasets/totaltext.py b/configs/textdet/_base_/datasets/totaltext.py new file mode 100644 index 000000000..b29ec6709 --- /dev/null +++ b/configs/textdet/_base_/datasets/totaltext.py @@ -0,0 +1,17 
@@ +tt_det_data_root = 'data/totaltext' + +tt_det_train = dict( + type='OCRDataset', + data_root=tt_det_data_root, + ann_file='textdet_train.json', + data_prefix=dict(img_path='imgs/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=None) + +tt_det_test = dict( + type='OCRDataset', + data_root=tt_det_data_root, + ann_file='textdet_test.json', + data_prefix=dict(img_path='imgs/'), + test_mode=True, + pipeline=None) diff --git a/configs/textrecog/_base_/datasets/icdar2015.py b/configs/textrecog/_base_/datasets/icdar2015.py index facdbd10e..bb15546c4 100644 --- a/configs/textrecog/_base_/datasets/icdar2015.py +++ b/configs/textrecog/_base_/datasets/icdar2015.py @@ -1,15 +1,17 @@ -ic15_rec_data_root = 'data/rec/icdar_2015/' +ic15_rec_data_root = 'data/icdar2015/' ic15_rec_train = dict( type='OCRDataset', data_root=ic15_rec_data_root, - ann_file='train_labels.json', + ann_file='textrecog_train.json', + data_prefix=dict(img_path='crops/'), test_mode=False, pipeline=None) ic15_rec_test = dict( type='OCRDataset', data_root=ic15_rec_data_root, - ann_file='test_labels.json', + ann_file='textrecog_test.json', + data_prefix=dict(img_path='crops/'), test_mode=True, pipeline=None) diff --git a/configs/textrecog/_base_/datasets/totaltext.py b/configs/textrecog/_base_/datasets/totaltext.py new file mode 100644 index 000000000..7eb7478a9 --- /dev/null +++ b/configs/textrecog/_base_/datasets/totaltext.py @@ -0,0 +1,17 @@ +tt_rec_data_root = 'data/totaltext/' + +tt_rec_train = dict( + type='OCRDataset', + data_root=tt_rec_data_root, + ann_file='textrecog_train.json', + data_prefix=dict(img_path='crops/'), + test_mode=False, + pipeline=None) + +tt_rec_test = dict( + type='OCRDataset', + data_root=tt_rec_data_root, + ann_file='textrecog_test.json', + data_prefix=dict(img_path='crops/'), + test_mode=True, + pipeline=None) diff --git a/dataset_zoo/icdar2015/metafile.yml b/dataset_zoo/icdar2015/metafile.yml new file mode 100644 index
000000000..c04728754 --- /dev/null +++ b/dataset_zoo/icdar2015/metafile.yml @@ -0,0 +1,28 @@ +Name: 'Incidental Scene Text IC15' +Paper: + Title: ICDAR 2015 Competition on Robust Reading + URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf + Venue: ICDAR + Year: '2015' + BibTeX: '@inproceedings{karatzas2015icdar, + title={ICDAR 2015 competition on robust reading}, + author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others}, + booktitle={2015 13th international conference on document analysis and recognition (ICDAR)}, + pages={1156--1160}, + year={2015}, + organization={IEEE}}' +Data: + Website: https://rrc.cvc.uab.es/?ch=4 + Language: + - English + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py new file mode 100644 index 000000000..2c4310863 --- /dev/null +++ b/dataset_zoo/icdar2015/textdet.py @@ -0,0 +1,51 @@ +data_root = './data/icdar2015' +cache_path = './data/.cache' + +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip', + save_name='ic15_textdet_train_img.zip', + md5='c51cbace155dcc4d98c8dd19d378f30d', + split=['train'], + content=['image'], + mapping=[['ic15_textdet_train_img', 'imgs/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', + save_name='ic15_textdet_test_img.zip', + md5='97e4c1ddcf074ffcc75feff2b63c35dd', + split=['test'], + content=['image'], + mapping=[['ic15_textdet_test_img', 'imgs/test']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'ch4_training_localization_transcription_gt.zip', + 
save_name='ic15_textdet_train_gt.zip', + md5='3bfaf1988960909014f7987d2343060b', + split=['train'], + content=['annotation'], + mapping=[['ic15_textdet_train_gt', 'annotations/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge4_Test_Task4_GT.zip', + save_name='ic15_textdet_test_gt.zip', + md5='8bce173b06d164b98c357b0eb96ef430', + split=['test'], + content=['annotation'], + mapping=[['ic15_textdet_test_gt', 'annotations/test']]), + ]) + +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'test'], + data_root=data_root, + gather=dict( + type='pair_gather', + suffixes=['.jpg', '.JPG'], + rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), + parser=dict(type='ICDAR2015TextDetParser'), + dumper=dict(type='JsonDumper'), + delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py new file mode 100644 index 000000000..febf6c4ae --- /dev/null +++ b/dataset_zoo/icdar2015/textrecog.py @@ -0,0 +1,42 @@ +data_root = './data/icdar2015' +cache_path = './data/.cache' + +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'ch4_training_word_images_gt.zip', + save_name='ic15_textrecog_train_img_gt.zip', + md5='600caf8c6a64a3dcf638839820edcca9', + split=['train'], + content=['image', 'annotation'], + mapping=[[ + 'ic15_textrecog_train_img_gt/gt.txt', 'annotations/train.txt' + ], ['ic15_textrecog_train_img_gt', 'crops/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip', + save_name='ic15_textrecog_test_img.zip', + md5='d7a71585f4cc69f89edbe534e7706d5d', + split=['test'], + content=['image'], + mapping=[['ic15_textrecog_test_img', 'crops/test']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge4_Test_Task3_GT.txt', + save_name='ic15_textrecog_test_gt.txt', + 
md5='d7a71585f4cc69f89edbe534e7706d5d', + split=['test'], + content=['annotation'], + mapping=[['ic15_textrecog_test_gt.txt', 'annotations/test.txt']]) + ]) + +data_converter = dict( + type='TextRecogDataConverter', + splits=['train', 'test'], + data_root=data_root, + gather=dict(type='mono_gather', mapping="f'{split}.txt'"), + parser=dict(type='ICDAR2015TextRecogParser'), + dumper=dict(type='JsonDumper')) diff --git a/dataset_zoo/icdar2015/textspotting.py b/dataset_zoo/icdar2015/textspotting.py new file mode 100644 index 000000000..413de5e88 --- /dev/null +++ b/dataset_zoo/icdar2015/textspotting.py @@ -0,0 +1,3 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextSpottingDataConverter') diff --git a/dataset_zoo/totaltext/metafile.yml b/dataset_zoo/totaltext/metafile.yml new file mode 100644 index 000000000..69777ee00 --- /dev/null +++ b/dataset_zoo/totaltext/metafile.yml @@ -0,0 +1,29 @@ +Name: 'Total Text' +Paper: + Title: "Total-Text: Towards Orientation Robustness in Scene Text Detection" + URL: https://link.springer.com/article/10.1007/s10032-019-00334-z + Venue: IJDAR + Year: '2020' + BibTeX: '@article{CK2019, + author = {Chee Kheng Chng and Chee Seng Chan and Chenglin Liu}, + title = {Total-Text: Towards Orientation Robustness in Scene Text Detection}, + journal = {International Journal on Document Analysis and Recognition (IJDAR)}, + volume = {23}, + pages = {31-52}, + year = {2020}, + doi = {10.1007/s10032-019-00334-z}}' +Data: + Website: https://github.com/cs-chan/Total-Text-Dataset + Language: + - English + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: BSD-3 + Link: https://github.com/cs-chan/Total-Text-Dataset/blob/master/LICENSE diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py new file mode 100644 index 000000000..8ad044989 --- /dev/null +++ b/dataset_zoo/totaltext/textdet.py @@ -0,0 +1,39 @@ +data_root = './data/totaltext' +cache_path = 
'./data/.cache' + +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://universityofadelaide.box.com/shared/static/' + '8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip', + save_name='totaltext.zip', + md5='5b56d71a4005a333cf200ff35ce87f75', + split=['train', 'test'], + content=['image'], + mapping=[['totaltext/Images/Train', 'imgs/train'], + ['totaltext/Images/Test', 'imgs/test']]), + dict( + url='https://universityofadelaide.box.com/shared/static/' + '2vmpvjb48pcrszeegx2eznzc4izan4zf.zip', + save_name='txt_format.zip', + md5='97e4c1ddcf074ffcc75feff2b63c35dd', + split=['train', 'test'], + content=['annotation'], + mapping=[['txt_format/Train', 'annotations/train'], + ['txt_format/Test', 'annotations/test']]), + ]) + +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'test'], + data_root=data_root, + gather=dict( + type='pair_gather', + suffixes=['.jpg', '.JPG'], + rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']), + parser=dict(type='TotaltextTextDetParser', data_root=data_root), + dumper=dict(type='JsonDumper'), + delete=['totaltext', 'txt_format', 'annotations']) diff --git a/dataset_zoo/totaltext/textrecog.py b/dataset_zoo/totaltext/textrecog.py new file mode 100644 index 000000000..e18f2f1f1 --- /dev/null +++ b/dataset_zoo/totaltext/textrecog.py @@ -0,0 +1,3 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextRecogCropConverter') diff --git a/dataset_zoo/totaltext/textspotting.py b/dataset_zoo/totaltext/textspotting.py new file mode 100644 index 000000000..413de5e88 --- /dev/null +++ b/dataset_zoo/totaltext/textspotting.py @@ -0,0 +1,3 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextSpottingDataConverter') diff --git a/dataset_zoo/wildreceipt/kie.py b/dataset_zoo/wildreceipt/kie.py new file mode 100644 index 000000000..6a1f3b613 --- /dev/null +++ b/dataset_zoo/wildreceipt/kie.py @@ -0,0 +1,32 @@ +data_root = './data/wildreceipt' 
+cache_path = './data/.cache' + +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/wildreceipt.tar', + save_name='wildreceipt.tar', + md5='2a2c4a1b4777fb4fe185011e17ad46ae', + split=['train', 'test'], + content=['image', 'annotation'], + mapping=[ + ['wildreceipt/wildreceipt/class_list.txt', 'class_list.txt'], + ['wildreceipt/wildreceipt/dict.txt', 'dict.txt'], + ['wildreceipt/wildreceipt/test.txt', 'test.txt'], + ['wildreceipt/wildreceipt/train.txt', 'train.txt'], + ['wildreceipt/wildreceipt/image_files', 'image_files'], + ]), + ]) + +data_converter = dict( + type='WildReceiptConverter', + splits=['train', 'test'], + data_root=data_root, + gather=dict( + type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root), + parser=dict(type='WildreceiptKIEParser', data_root=data_root), + dumper=dict(type='WildreceiptOpensetDumper'), + delete=['wildreceipt']) diff --git a/dataset_zoo/wildreceipt/metafile.yml b/dataset_zoo/wildreceipt/metafile.yml new file mode 100644 index 000000000..efc7f44eb --- /dev/null +++ b/dataset_zoo/wildreceipt/metafile.yml @@ -0,0 +1,29 @@ +Name: 'WildReceipt' +Paper: + Title: "Spatial Dual-Modality Graph Reasoning for Key Information Extraction" + URL: https://arxiv.org/abs/2103.14470 + Venue: arXiv + Year: '2021' + BibTeX: '@article{sun2021spatial, + title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, + author={Sun, Hongbin and Kuang, Zhanghui and Yue, Xiaoyu and Lin, Chenhao and Zhang, Wayne}, + journal={arXiv preprint arXiv:2103.14470}, + year={2021} +} +' +Data: + Website: https://download.openmmlab.com/mmocr/data/wildreceipt.tar + Language: + - English + Scene: + - Receipt + Granularity: + - Word + Tasks: + - kie + - textdet + - textrecog + - textspotting + License: + Type: N/A + Link: N/A diff --git a/dataset_zoo/wildreceipt/textdet.py b/dataset_zoo/wildreceipt/textdet.py new
file mode 100644 index 000000000..cffb8d59d --- /dev/null +++ b/dataset_zoo/wildreceipt/textdet.py @@ -0,0 +1,6 @@ +_base_ = ['kie.py'] + +data_converter = dict( + type='TextDetDataConverter', + parser=dict(type='WildreceiptTextDetParser'), + dumper=dict(type='JsonDumper')) diff --git a/dataset_zoo/wildreceipt/textrecog.py b/dataset_zoo/wildreceipt/textrecog.py new file mode 100644 index 000000000..e18f2f1f1 --- /dev/null +++ b/dataset_zoo/wildreceipt/textrecog.py @@ -0,0 +1,3 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextRecogCropConverter') diff --git a/dataset_zoo/wildreceipt/textspotting.py b/dataset_zoo/wildreceipt/textspotting.py new file mode 100644 index 000000000..413de5e88 --- /dev/null +++ b/dataset_zoo/wildreceipt/textspotting.py @@ -0,0 +1,3 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextSpottingDataConverter') diff --git a/docs/en/index.rst b/docs/en/index.rst index 52de5b45a..2d68ce3f8 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -51,6 +51,7 @@ You can switch between English and Chinese in the lower-left corner of the layou :maxdepth: 2 :caption: Dataset Zoo + user_guides/data_prepare/dataset_preparer.md user_guides/data_prepare/det.md user_guides/data_prepare/recog.md user_guides/data_prepare/kie.md diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md new file mode 100644 index 000000000..d95b51359 --- /dev/null +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -0,0 +1,39 @@ +# Dataset Preparer + +## One-click data preparation script + +MMOCR provides a unified one-stop data preparation script `prepare_dataset.py`. + +Only one line of command is needed to complete the data download, decompression, and format conversion. 
+ +```bash +python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK] --nproc [$NPROC] +``` + +| ARGS | Type | Description | +| ------------ | ---- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| dataset_name | str | (required) dataset name. | +| --task | str | Convert the dataset to the format of a specified task supported by MMOCR. options are: 'textdet', 'textrecog', 'textspotting', and 'kie'. | +| --nproc | int | Number of processors to be used. Defaults to 4. | + +For example, the following command shows how to use the script to prepare the ICDAR2015 dataset for text detection task. + +```bash +python tools/dataset_converters/prepare_dataset.py icdar2015 --task textdet +``` + +Also, the script supports preparing multiple datasets at the same time. For example, the following command shows how to prepare the ICDAR2015 and TotalText datasets for text recognition task. + +```bash +python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog +``` + +The following table shows the supported datasets. + +| Dataset Name | Text Detection | Text Recognition | Text Spotting | KIE | +| ------------ | -------------- | ---------------- | ------------- | --- | +| icdar2015 | ✓ | ✓ | ✓ | | +| totaltext | ✓ | ✓ | ✓ | | +| wildreceipt | ✓ | ✓ | ✓ | ✓ | + +## Advanced Usage\[Coming Soon\] diff --git a/docs/en/user_guides/data_prepare/det.md b/docs/en/user_guides/data_prepare/det.md index a869f3f74..f734b9ea1 100644 --- a/docs/en/user_guides/data_prepare/det.md +++ b/docs/en/user_guides/data_prepare/det.md @@ -1,4 +1,8 @@ -# Text Detection +# Text Detection\[Deprecated\] + +```{warning} +This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). 
+``` ## Overview diff --git a/docs/en/user_guides/data_prepare/kie.md b/docs/en/user_guides/data_prepare/kie.md index cbbc82904..4e1d59dcd 100644 --- a/docs/en/user_guides/data_prepare/kie.md +++ b/docs/en/user_guides/data_prepare/kie.md @@ -1,4 +1,8 @@ -# Key Information Extraction +# Key Information Extraction\[Deprecated\] + +```{warning} +This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). +``` ## Overview diff --git a/docs/en/user_guides/data_prepare/recog.md b/docs/en/user_guides/data_prepare/recog.md index 3a1d57938..63b42ad21 100644 --- a/docs/en/user_guides/data_prepare/recog.md +++ b/docs/en/user_guides/data_prepare/recog.md @@ -1,4 +1,8 @@ -# Text Recognition +# Text Recognition\[Deprecated\] + +```{warning} +This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). +``` ## Overview diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index c8732d3ef..a2761dbdd 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -51,6 +51,7 @@ :maxdepth: 2 :caption: 数据集支持 + user_guides/data_prepare/dataset_preparer.md user_guides/data_prepare/det.md user_guides/data_prepare/recog.md user_guides/data_prepare/kie.md diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md new file mode 100644 index 000000000..01497d8f6 --- /dev/null +++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md @@ -0,0 +1,39 @@ +# 数据准备 + +## 一键式数据准备脚本 + +MMOCR 提供了统一的一站式数据集准备脚本 `prepare_dataset.py`。 + +仅需一行命令即可完成数据的下载、解压,以及格式转换。 + +```bash +python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK] --nproc [$NPROC] +``` + +| 参数 | 类型 | 说明 | +| ------------ | ---- | ----------------------------------------------------------------------------------------------------- | +| dataset_name | str | (必须)需要准备的数据集名称。 | +| --task | str | 将数据集格式转换为指定任务的 MMOCR 格式。可选项为: 
'textdet', 'textrecog', 'textspotting' 和 'kie'。 | +| --nproc | str | 使用的进程数,默认为 4。 | + +例如,以下命令展示了如何使用该脚本为 ICDAR2015 数据集准备文本检测任务所需的数据。 + +```bash +python tools/dataset_converters/prepare_dataset.py icdar2015 --task textdet +``` + +该脚本也支持同时准备多个数据集,例如,以下命令展示了如何使用该脚本同时为 ICDAR2015 和 TotalText 数据集准备文本识别任务所需的数据。 + +```bash +python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog +``` + +下表展示了目前支持一键下载及格式转换的数据集。 + +| 数据集名称 | 文本检测任务 | 文本识别任务 | 端到端文本检测识别任务 | 关键信息抽取任务 | +| ----------- | ------------ | ------------ | ---------------------- | ---------------- | +| icdar2015 | ✓ | ✓ | ✓ | | +| totaltext | ✓ | ✓ | ✓ | | +| wildreceipt | ✓ | ✓ | ✓ | ✓ | + +## 进阶用法\[待更新\] diff --git a/docs/zh_cn/user_guides/data_prepare/det.md b/docs/zh_cn/user_guides/data_prepare/det.md index a4a7d81a4..dc53794a0 100644 --- a/docs/zh_cn/user_guides/data_prepare/det.md +++ b/docs/zh_cn/user_guides/data_prepare/det.md @@ -1,4 +1,8 @@ -# 文字检测 +# 文字检测\[过时\] + +```{warning} +该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 +``` ## 概览 diff --git a/docs/zh_cn/user_guides/data_prepare/kie.md b/docs/zh_cn/user_guides/data_prepare/kie.md index 51b5e962c..2e96cae55 100644 --- a/docs/zh_cn/user_guides/data_prepare/kie.md +++ b/docs/zh_cn/user_guides/data_prepare/kie.md @@ -1,4 +1,8 @@ -# 关键信息提取 +# 关键信息提取\[过时\] + +```{warning} +该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 +``` ## 概览 diff --git a/docs/zh_cn/user_guides/data_prepare/recog.md b/docs/zh_cn/user_guides/data_prepare/recog.md index ce6679b90..3925ca42a 100644 --- a/docs/zh_cn/user_guides/data_prepare/recog.md +++ b/docs/zh_cn/user_guides/data_prepare/recog.md @@ -1,7 +1,7 @@ -# 文字识别 +# 文字识别\[过时\] ```{warning} -该章节翻译落后于[英文版文档](../../en/user_guides/../../user_guides/data_prepare/recog.md)。 +该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 ``` ## 概览 diff --git a/mmocr/datasets/preparers/__init__.py b/mmocr/datasets/preparers/__init__.py new file mode 100644 index 000000000..a104478a6 --- 
/dev/null +++ b/mmocr/datasets/preparers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_converter import (TextDetDataConverter, TextRecogDataConverter, + TextSpottingDataConverter, WildReceiptConverter) +from .data_obtainer import NaiveDataObtainer +from .data_preparer import DatasetPreparer +from .dumpers import * # noqa +from .parsers import * # noqa + +__all__ = [ + 'DatasetPreparer', 'NaiveDataObtainer', 'TextDetDataConverter', + 'TextRecogDataConverter', 'TextSpottingDataConverter', + 'WildReceiptConverter' +] diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py new file mode 100644 index 000000000..26f37e327 --- /dev/null +++ b/mmocr/datasets/preparers/data_converter.py @@ -0,0 +1,629 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +import re +import shutil +from abc import abstractmethod +from functools import partial +from typing import Dict, List, Optional, Sequence, Tuple + +import mmcv +from mmengine import mkdir_or_exist, track_parallel_progress + +from mmocr.utils import bbox2poly, crop_img, poly2bbox, retrieve_files +from .data_preparer import DATA_CONVERTER, DATA_DUMPER, DATA_PARSER + + +class BaseDataConverter: + """Base class for data processor. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset files. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + task (str): Task of the dataset. + delete (Optional[List]): A list of files to be deleted after + conversion. 
+ """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + task: str, + delete: Optional[List] = None): + assert isinstance(nproc, int) and nproc > 0, \ + 'nproc must be a positive integer.' + self.splits = splits + self.data_root = data_root + self.nproc = nproc + self.task = task + self.delete = delete + parser.update(dict(nproc=nproc)) + dumper.update(dict(task=task)) + self.parser = DATA_PARSER.build(parser) + self.dumper = DATA_DUMPER.build(dumper) + gather_type = gather.pop('type') + self.gather_args = gather + if gather_type == 'pair_gather': + self.gather = self.pair_gather + elif gather_type == 'mono_gather': + self.gather = self.mono_gather + else: + raise NotImplementedError + + def __call__(self): + """Process the data.""" + # Convert and dump annotations to MMOCR format + for split in self.splits: + print(f'Parsing {split} split...') + # Gather the info such as file names required by parser + img_path = osp.join(self.data_root, 'imgs', split) + ann_path = osp.join(self.data_root, 'annotations') + gather_args = dict( + img_path=img_path, ann_path=ann_path, split=split) + gather_args.update(self.gather_args) + files = self.gather(**gather_args) + # Convert dataset annotations to MMOCR format + samples = self.parser.parse_files(files, split) + print(f'Packing {split} annotations...') + func = partial(self.pack_instance, split=split) + samples = track_parallel_progress(func, samples, nproc=self.nproc) + samples = self.add_meta(samples) + # Dump annotation files + self.dumper.dump(samples, self.data_root, split) + self.clean() + + @abstractmethod + def pack_instance(self, sample: Tuple, split: str) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, ann_file). + - img_path (str): Path to image file. + - instances (Sequence[Dict]): A list of converted annos. + split (str): The split of the instance. 
+ + Returns: + Dict: An MMOCR format instance. + """ + + @abstractmethod + def add_meta(self, sample: Dict) -> Dict: + """Add meta information to the sample. + + Args: + sample (Dict): A sample of the dataset. + + Returns: + Dict: A sample with meta information. + """ + + def mono_gather(self, ann_path: str, mapping: str, split: str, + **kwargs) -> str: + """Gathering the dataset file. Specifically for the case that only one + annotation file is needed. For example, + + img_001.jpg \ + img_002.jpg ---> train.json + img_003.jpg / + + Args: + anno_path (str): Path to the annotations. + mapping (str): Mapping rule of the annotation names. For example, + "f'{split}.json'" will return 'train.json' when the split is + 'train'. + split (str): The current split. + + Returns: + str: Path to the annotation file. + """ + + return osp.join(ann_path, eval(mapping)) + + def pair_gather(self, img_path, suffixes, rule: Sequence, + **kwargs) -> List[Tuple]: + """Gathering the dataset files. Specifically for the paired + annotations. That is to say, each image has a corresponding annotation + file. For example, + + img_001.jpg <---> gt_img_001.txt + img_002.jpg <---> gt_img_002.txt + img_003.jpg <---> gt_img_003.txt + + Args: + img_path (str): Path to the images. + suffixes (List[str]): File suffixes that used for searching. + rule (Sequence): The rule for pairing the files. The + first element is the matching pattern for the file, and the + second element is the replacement pattern. + + Returns: + List[Tuple]: A list of tuples (img_path, ann_path). 
+ """ + files = list() + for file in retrieve_files(img_path, suffixes): + file2 = re.sub(rule[0], rule[1], osp.basename(file)) + file2 = file.replace(osp.basename(file), file2) + file2 = file2.replace('imgs', 'annotations') + files.append((file, file2)) + + return files + + def clean(self) -> None: + for d in self.delete: + delete_file = osp.join(self.data_root, d) + if osp.exists(delete_file): + shutil.rmtree(delete_file) + + +@DATA_CONVERTER.register_module() +class TextDetDataConverter(BaseDataConverter): + """Text detection data converter. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset files. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + delete (Optional[List]): A list of files to be deleted after + conversion. Defaults to ['annotations]. + """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + delete: List = ['annotations']) -> None: + super().__init__( + splits=splits, + data_root=data_root, + gather=gather, + parser=parser, + dumper=dumper, + nproc=nproc, + delete=delete, + task='textdet') + + def pack_instance(self, + sample: Tuple, + split: str, + bbox_label: int = 0) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. + + Args: + sample (Tuple): A tuple of (img_file, ann_file). + - img_path (str): Path to image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + - 'poly' or 'box' + - ignore + - bbox_label (optional) + split (str): The split of the instance. + + Returns: + Dict: An MMOCR format instance. 
+ """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + packed_instances = list() + for instance in instances: + poly = instance.get('poly', None) + box = instance.get('box', None) + assert box or poly + packed_sample = dict( + polygon=poly if poly else list( + bbox2poly(box).astype('float64')), + bbox=box if box else list(poly2bbox(poly).astype('float64')), + bbox_label=bbox_label, + ignore=instance['ignore']) + packed_instances.append(packed_sample) + + packed_instances = dict( + instances=packed_instances, + img_path=img_path.replace(self.data_root, ''), + height=h, + width=w) + + return packed_instances + + def add_meta(self, sample: Dict) -> Dict: + meta = { + 'metainfo': { + 'dataset_type': 'TextDetDataset', + 'task_name': 'textdet', + 'category': [{ + 'id': 0, + 'name': 'text' + }] + }, + 'data_list': sample + } + return meta + + +@DATA_CONVERTER.register_module() +class TextSpottingDataConverter(BaseDataConverter): + """Text spotting data converter. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset files. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + delete (Optional[List]): A list of files to be deleted after + conversion. Defaults to ['annotations]. + """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + delete: List = ['annotations']) -> None: + super().__init__( + splits=splits, + data_root=data_root, + gather=gather, + parser=parser, + dumper=dumper, + nproc=nproc, + delete=delete, + task='textspotting') + + def pack_instance(self, + sample: Tuple, + split: str, + bbox_label: int = 0) -> Dict: + """Pack the parsed annotation info to an MMOCR format instance. 
+ + Args: + sample (Tuple): A tuple of (img_file, ann_file). + - img_path (str): Path to image file. + - instances (Sequence[Dict]): A list of converted annos. Each + element should be a dict with the following keys: + - 'poly' or 'box' + - ignore + - bbox_label (optional) + split (str): The split of the instance. + + Returns: + Dict: An MMOCR format instance. + """ + + img_path, instances = sample + + img = mmcv.imread(img_path) + h, w = img.shape[:2] + + packed_instances = list() + for instance in instances: + assert 'text' in instance, 'Text is not found in the instance.' + poly = instance.get('poly', None) + box = instance.get('box', None) + assert box or poly + packed_sample = dict( + polygon=poly if poly else list( + bbox2poly(box).astype('float64')), + bbox=box if box else list(poly2bbox(poly).astype('float64')), + bbox_label=bbox_label, + ignore=instance['ignore'], + text=instance['text']) + packed_instances.append(packed_sample) + + packed_instances = dict( + instances=packed_instances, img_path=img_path, height=h, width=w) + + return packed_instances + + def add_meta(self, sample: Dict) -> Dict: + meta = { + 'metainfo': { + 'dataset_type': 'TextSpottingDataset', + 'task_name': 'textspotting', + 'category': [{ + 'id': 0, + 'name': 'text' + }] + }, + 'data_list': sample + } + return meta + + +@DATA_CONVERTER.register_module() +class TextRecogDataConverter(BaseDataConverter): + """Text recognition data converter. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset annotations. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + delete (Optional[List]): A list of files to be deleted after + conversion. Defaults to ['annotations]. 
+ """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + delete: List = ['annotations']): + super().__init__( + splits=splits, + data_root=data_root, + gather=gather, + parser=parser, + dumper=dumper, + nproc=nproc, + task='textrecog', + delete=delete) + + def pack_instance(self, sample: Tuple, split: str) -> Dict: + """Pack the text info to a recognition instance. + + Args: + samples (Tuple): A tuple of (img_name, text). + split (str): The split of the instance. + + Returns: + Dict: The packed instance. + """ + + def pack(img_name: str, text: str, split: str) -> Dict: + return dict( + instances=[dict(text=text)], + img_path=osp.join(split, img_name)) + + img_name, text = sample + + return pack(img_name, text, split) + + def add_meta(self, sample: Dict) -> Dict: + meta = { + 'metainfo': { + 'dataset_type': 'TextRecogDataset', + 'task_name': 'textrecog', + 'category': [{ + 'id': 0, + 'name': 'text' + }] + }, + 'data_list': sample + } + return meta + + +@DATA_CONVERTER.register_module() +class TextRecogCropConverter(TextRecogDataConverter): + """Text recognition crop converter. This converter will crop the text from + the original image. The parser used for this Converter should be a TextDet + parser. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset annotations. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + long_edge_pad_ratio (float): The ratio of padding the long edge of the + cropped image. Defaults to 0.1. + short_edge_pad_ratio (float): The ratio of padding the short edge of + the cropped image. Defaults to 0.05. + delete (Optional[List]): A list of files to be deleted after + conversion. Defaults to ['annotations]. 
+ crop_save_dir (str): The directory to save the cropped images. + Defaults to 'crops'. + """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + long_edge_pad_ratio: float = 0.1, + short_edge_pad_ratio: float = 0.05, + delete: List = ['annotations'], + crop_save_path: str = 'crops'): + super().__init__( + splits=splits, + data_root=data_root, + gather=gather, + parser=parser, + dumper=dumper, + nproc=nproc, + delete=delete) + self.ignore = self.parser.ignore + self.lepr = long_edge_pad_ratio + self.sepr = short_edge_pad_ratio + self.crop_save_path = osp.join(self.data_root, crop_save_path) + mkdir_or_exist(self.crop_save_path) + for split in splits: + mkdir_or_exist(osp.join(self.crop_save_path, split)) + + def pack_instance(self, sample: Tuple, split: str) -> List: + """Crop patches from image. + + Args: + samples (Tuple): A tuple of (img_name, text). + split (str): The split of the instance. + + Return: + List: The list of cropped patches. 
+ """ + + def get_box(instance: Dict) -> List: + if 'box' in instance: + return bbox2poly(instance['box']).tolist() + if 'poly' in instance: + return bbox2poly(poly2bbox(instance['poly'])).tolist() + + data_list = [] + img_path, instances = sample + img = mmcv.imread(img_path) + for i, instance in enumerate(instances): + box, text = get_box(instance), instance['text'] + if text == self.ignore: + continue + patch = crop_img(img, box, self.lepr, self.sepr) + if patch.shape[0] == 0 or patch.shape[1] == 0: + continue + patch_name = osp.splitext( + osp.basename(img_path))[0] + f'_{i}' + osp.splitext( + osp.basename(img_path))[1] + dst_path = osp.join(self.crop_save_path, split, patch_name) + mmcv.imwrite(patch, dst_path) + rec_instance = dict( + instances=[dict(text=text)], img_path=f'{split}/{patch_name}') + data_list.append(rec_instance) + + return data_list + + +@DATA_CONVERTER.register_module() +class WildReceiptConverter(BaseDataConverter): + """MMOCR only supports wildreceipt dataset for KIE task now. This converter + converts the wildreceipt dataset from close set to open set. + + Args: + splits (List): A list of splits to be processed. + data_root (str): Path to the data root. + gather (Dict): Config dict for gathering the dataset files. + parser (Dict): Config dict for parsing the dataset annotations. + dumper (Dict): Config dict for dumping the dataset files. + nproc (int): Number of processes to process the data. + delete (Optional[List]): A list of files to be deleted after + conversion. Defaults to ['annotations]. + merge_bg_others (bool): If True, give the same label to "background" + class and "others" class. Defaults to True. + ignore_idx (int): Index for ``ignore`` class. Defaults to 0. + others_idx (int): Index for ``others`` class. Defaults to 25. 
+ """ + + def __init__(self, + splits: List, + data_root: str, + gather: Dict, + parser: Dict, + dumper: Dict, + nproc: int, + delete: Optional[List] = None, + merge_bg_others: bool = False, + ignore_idx: int = 0, + others_idx: int = 25): + self.ignore_idx = ignore_idx + self.others_idx = others_idx + self.merge_bg_others = merge_bg_others + parser.update(dict(ignore=ignore_idx)) + super().__init__( + splits=splits, + data_root=data_root, + gather=gather, + parser=parser, + dumper=dumper, + nproc=nproc, + task='kie', + delete=delete) + + def pack_instance(self, sample: str, split: str): + """Pack line-json str of close set to line-json str of open set. + + Args: + sample (str): The string to be deserialized to + the close set dictionary object. + split (str): The split of the instance. + """ + # Two labels at the same index of the following two lists + # make up a key-value pair. For example, in wildreceipt, + # closeset_key_inds[0] maps to "Store_name_key" + # and closeset_value_inds[0] maps to "Store_addr_value". 
+ closeset_key_inds = list(range(2, self.others_idx, 2)) + closeset_value_inds = list(range(1, self.others_idx, 2)) + + openset_node_label_mapping = { + 'bg': 0, + 'key': 1, + 'value': 2, + 'others': 3 + } + if self.merge_bg_others: + openset_node_label_mapping['others'] = openset_node_label_mapping[ + 'bg'] + + closeset_obj = json.loads(sample) + openset_obj = { + 'file_name': closeset_obj['file_name'], + 'height': closeset_obj['height'], + 'width': closeset_obj['width'], + 'annotations': [] + } + + edge_idx = 1 + label_to_edge = {} + for anno in closeset_obj['annotations']: + label = anno['label'] + if label == self.ignore_idx: + anno['label'] = openset_node_label_mapping['bg'] + anno['edge'] = edge_idx + edge_idx += 1 + elif label == self.others_idx: + anno['label'] = openset_node_label_mapping['others'] + anno['edge'] = edge_idx + edge_idx += 1 + else: + edge = label_to_edge.get(label, None) + if edge is not None: + anno['edge'] = edge + if label in closeset_key_inds: + anno['label'] = openset_node_label_mapping['key'] + elif label in closeset_value_inds: + anno['label'] = openset_node_label_mapping['value'] + else: + tmp_key = 'key' + if label in closeset_key_inds: + label_with_same_edge = closeset_value_inds[ + closeset_key_inds.index(label)] + elif label in closeset_value_inds: + label_with_same_edge = closeset_key_inds[ + closeset_value_inds.index(label)] + tmp_key = 'value' + edge_counterpart = label_to_edge.get( + label_with_same_edge, None) + if edge_counterpart is not None: + anno['edge'] = edge_counterpart + else: + anno['edge'] = edge_idx + edge_idx += 1 + anno['label'] = openset_node_label_mapping[tmp_key] + label_to_edge[label] = anno['edge'] + + openset_obj['annotations'] = closeset_obj['annotations'] + + return json.dumps(openset_obj, ensure_ascii=False) diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py new file mode 100644 index 000000000..13311e790 --- /dev/null +++ 
b/mmocr/datasets/preparers/data_obtainer.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import ssl +import urllib.request as request +from typing import Dict, List, Optional, Tuple + +from mmengine import mkdir_or_exist + +from mmocr.utils import check_integrity, iszip +from .data_preparer import DATA_OBTAINER + +ssl._create_default_https_context = ssl._create_unverified_context + + +@DATA_OBTAINER.register_module() +class NaiveDataObtainer: + """A naive pipeline for obtaining dataset. + + download -> extract -> move + + Args: + files (list[dict]): A list of file information. + cache_path (str): The path to cache the downloaded files. + data_root (str): The root path of the dataset. + """ + + def __init__(self, files: List[Dict], cache_path: str, + data_root: str) -> None: + self.files = files + self.cache_path = cache_path + self.data_root = data_root + mkdir_or_exist(osp.join(self.data_root, 'imgs')) + mkdir_or_exist(osp.join(self.data_root, 'annotations')) + mkdir_or_exist(self.cache_path) + + def __call__(self): + for file in self.files: + save_name, url, md5 = file['save_name'], file['url'], file['md5'] + download_path = osp.join( + self.cache_path, + osp.basename(url) if save_name is None else save_name) + # Download required files + if not check_integrity(download_path, md5): + self.download(url=url, dst_path=download_path) + # Extract downloaded zip files to data root + self.extract(src_path=download_path, dst_path=self.data_root) + # Move & Rename dataset files + if 'mapping' in file: + self.move(mapping=file['mapping']) + self.clean() + + def download(self, url: Optional[str], dst_path: str) -> None: + """Download file from given url with progress bar. + + Args: + url (str): The url to download the file. + dst_path (str): The destination path to save the file. + """ + + def progress(down: float, block: float, size: float) -> None: + """Show download progress. 
+ + Args: + down (float): Downloaded size. + block (float): Block size. + size (float): Total size of the file. + """ + + percent = min(100. * down * block / size, 100) + file_name = osp.basename(dst_path) + print(f'\rDownloading {file_name}: {percent:.2f}%', end='') + + if not url and not osp.exists(dst_path): + raise FileNotFoundError( + 'Direct url is not available for this dataset.' + ' Please manually download the required files' + ' following the guides.') + + request.urlretrieve(url, dst_path, progress) + + def extract(self, + src_path: str, + dst_path: str, + delete: bool = False) -> None: + """Extract zip/tar.gz files. + + Args: + src_path (str): Path to the zip file. + dst_path (str): Path to the destination folder. + delete (bool, optional): Whether to delete the zip file. Defaults + to False. + """ + + if not iszip(src_path): + # Move the file to the destination folder if it is not a zip + shutil.move(src_path, dst_path) + return + + zip_name = osp.basename(src_path).split('.')[0] + if dst_path is None: + dst_path = osp.join(osp.dirname(src_path), zip_name) + else: + dst_path = osp.join(dst_path, zip_name) + mkdir_or_exist(dst_path) + print(f'Extracting: {osp.basename(src_path)}') + if src_path.endswith('.zip'): + try: + import zipfile + except ImportError: + raise ImportError( + 'Please install zipfile by running "pip install zipfile".') + with zipfile.ZipFile(src_path, 'r') as zip_ref: + zip_ref.extractall(dst_path) + elif src_path.endswith('.tar.gz') or src_path.endswith('.tar'): + if src_path.endswith('.tar.gz'): + mode = 'r:gz' + elif src_path.endswith('.tar'): + mode = 'r:' + try: + import tarfile + except ImportError: + raise ImportError( + 'Please install tarfile by running "pip install tarfile".') + with tarfile.open(src_path, mode) as tar_ref: + tar_ref.extractall(dst_path) + if delete: + os.remove(src_path) + + def move(self, mapping: List[Tuple[str, str]]) -> None: + """Rename and move dataset files one by one. 
+ + Args: + mapping (List[Tuple[str, str]]): A list of tuples, each + tuple contains the source file name and the destination file name. + """ + for src, dst in mapping: + src = osp.join(self.data_root, src) + dst = osp.join(self.data_root, dst) + if osp.exists(src) and not osp.exists(dst): + shutil.move(src, dst) + + def clean(self) -> None: + """Remove empty dirs.""" + for root, dirs, files in os.walk(self.data_root, topdown=False): + if not files and not dirs: + os.rmdir(root) diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py new file mode 100644 index 000000000..094a7c056 --- /dev/null +++ b/mmocr/datasets/preparers/data_preparer.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import time + +from mmengine import Registry +from mmengine.config import Config + +DATA_OBTAINER = Registry('data_obtainer') +DATA_CONVERTER = Registry('data_converter') +DATA_PARSER = Registry('data_parser') +DATA_DUMPER = Registry('data_dumper') + + +class DatasetPreparer: + """Base class of dataset preparer. + + Dataset preparer is used to prepare dataset for MMOCR. It mainly consists + of two steps: + + 1. Obtaining the dataset + - Download + - Extract + - Move/Rename + 2. Process the dataset + - Convert to mmocr format + - Dump the annotation file + - Clean useless files + + After all these steps, the original datasets have been prepared for + usage in MMOCR. Check out the dataset format used in MMOCR here: + https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html + """ + + def __init__(self, + cfg_path: str, + dataset_name: str, + task: str = 'textdet', + nproc: int = 4) -> None: + """Initialization. Load necessary meta info and print license. + + Args: + cfg_path (str): Path to dataset config file. + dataset_name (str): Dataset name. + task (str): Task type. Options are 'textdet', 'textrecog', + 'textspotter', and 'kie'. Defaults to 'textdet'. 
+ nproc (int): Number of parallel processes. Defaults to 4. + """ + cfg_path = osp.join(cfg_path, dataset_name) + self.nproc = nproc + self.task = task + self.parse_meta(cfg_path) + self.parse_cfg(cfg_path) + + def __call__(self): + """Prepare the dataset.""" + if self.with_obtainer: + self.data_obtainer() + if self.with_processor: + self.data_converter() + + def parse_meta(self, cfg_path: str) -> None: + """Parse meta file. + + Args: + cfg_path (str): Path to meta file. + """ + meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml')) + assert self.task in meta['Data']['Tasks'], \ + f'Task {self.task} not supported!' + # License related + if meta['Data']['License']['Type']: + print(f"\033[1;33;40mDataset Name: {meta['Name']}") + print(f"License Type: {meta['Data']['License']['Type']}") + print(f"License Link: {meta['Data']['License']['Link']}") + print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m") + print( + '\033[1;31;43mMMOCR does not own the dataset. Using this ' + 'dataset you must accept the license provided by the owners, ' + 'and cite the corresponding papers appropriately.') + print('If you do not agree with the above license, please cancel ' + 'the progress immediately by pressing ctrl+c. Otherwise, ' + 'you are deemed to accept the terms and conditions.\033[0m') + for i in range(5): + print(f'{5-i}...') + time.sleep(1) + + def parse_cfg(self, cfg_path: str) -> None: + """Parse dataset config file. + + Args: + cfg_path (str): Path to dataset config file. 
+ """ + cfg = Config.fromfile(osp.join(cfg_path, self.task + '.py')) + + if 'data_obtainer' in cfg: + self.data_obtainer = DATA_OBTAINER.build(cfg.data_obtainer) + if 'data_converter' in cfg: + cfg.data_converter.update(dict(nproc=self.nproc)) + self.data_converter = DATA_CONVERTER.build(cfg.data_converter) + + @property + def with_obtainer(self) -> bool: + """bool: whether the data preparer has an obtainer""" + return hasattr(self, + 'data_obtainer') and self.data_obtainer is not None + + @property + def with_processor(self) -> bool: + """bool: whether the data preparer has an obtainer""" + return hasattr(self, + 'data_converter') and self.data_converter is not None diff --git a/mmocr/datasets/preparers/dumpers/__init__.py b/mmocr/datasets/preparers/dumpers/__init__.py new file mode 100644 index 000000000..4dc93d9c8 --- /dev/null +++ b/mmocr/datasets/preparers/dumpers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dumpers import JsonDumper, WildreceiptOpensetDumper + +__all__ = ['JsonDumper', 'WildreceiptOpensetDumper'] diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py new file mode 100644 index 000000000..c6fe205c0 --- /dev/null +++ b/mmocr/datasets/preparers/dumpers/dumpers.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp
+from typing import List
+
+import mmengine
+
+from mmocr.utils import list_to_file
+from ..data_preparer import DATA_DUMPER
+
+
+@DATA_DUMPER.register_module()
+class JsonDumper:
+
+    def __init__(self, task: str) -> None:
+        self.task = task
+        self.format = format  # FIXME: binds the builtin 'format' and is unused; remove
+
+    def dump(self, data: List, data_root: str, split: str) -> None:
+        dst_file = osp.join(data_root, f'{self.task}_{split}.json')
+        mmengine.dump(data, dst_file)
+
+
+@DATA_DUMPER.register_module()
+class WildreceiptOpensetDumper:
+
+    def __init__(self, task: str) -> None:
+        self.task = task
+
+    def dump(self, data: List, data_root: str, split: str) -> None:
+        list_to_file(osp.join(data_root, f'openset_{split}.txt'), data)
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
new file mode 100644
index 000000000..66070ccc2
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .ic15_parser import ICDAR2015TextDetParser, ICDAR2015TextRecogParser
+from .totaltext_parser import TotaltextTextDetParser
+from .wildreceipt import WildreceiptKIEParser
+
+__all__ = [
+    'ICDAR2015TextDetParser', 'ICDAR2015TextRecogParser',
+    'TotaltextTextDetParser', 'WildreceiptKIEParser'
+]
diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py
new file mode 100644
index 000000000..83ef704a4
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/base.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod +from functools import partial +from typing import Dict, List, Optional, Tuple + +from mmengine import track_parallel_progress + + +class BaseParser: + + def __init__(self, + data_root: Optional[str] = None, + nproc: int = 1) -> None: + self.data_root = data_root + self.nproc = nproc + + def __call__(self, files: List[Tuple], split: str) -> List: + samples = self.parse_files(files, split) + return samples + + def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]: + """Convert annotations to MMOCR format. + + Args: + files (Tuple): A tuple of path to image and annotation. + + Returns: + List[Tuple]: A list of a tuple of (image_path, instance) + """ + func = partial(self.parse_file, split=split) + samples = track_parallel_progress(func, files, nproc=self.nproc) + return samples + + @abstractmethod + def parse_file(self, file: Tuple, split: str) -> Dict: + """Convert annotation for a single image.""" + raise NotImplementedError diff --git a/mmocr/datasets/preparers/parsers/ic15_parser.py b/mmocr/datasets/preparers/parsers/ic15_parser.py new file mode 100644 index 000000000..b40cdb26c --- /dev/null +++ b/mmocr/datasets/preparers/parsers/ic15_parser.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +from ..data_preparer import DATA_PARSER +from .base import BaseParser +from .loaders import txt_loader + + +@DATA_PARSER.register_module() +class ICDAR2015TextDetParser(BaseParser): + """ICDAR2015 Text Detection Parser. 
+
+    The original annotation format of this dataset is stored in txt files,
+    which is formed as the following format: x1, y1, x2, y2, x3, y3, x4, y4,
+    transcription
+    """
+
+    def __init__(self,
+                 separator: str = ',',
+                 ignore: str = '###',
+                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
+                 encoding: str = 'utf-8-sig',
+                 nproc: int = 1) -> None:
+        self.sep = separator
+        self.format = format
+        self.encoding = encoding
+        self.ignore = ignore
+        super().__init__(nproc=nproc)
+
+    def parse_file(self, file: Tuple, split: str) -> Dict:
+        """Parse single annotation."""
+        img_file, txt_file = file
+        instances = list()
+        for anno in txt_loader(txt_file, self.sep, self.format, self.encoding):
+            anno = list(anno.values())
+            poly = list(map(float, anno[0:-1]))
+            text = anno[-1]
+            instances.append(
+                dict(poly=poly, text=text, ignore=text == self.ignore))
+
+        return img_file, instances
+
+
+@DATA_PARSER.register_module()
+class ICDAR2015TextRecogParser(BaseParser):
+    """ICDAR2015 Text Recognition Parser.
+ + The original annotation format of this dataset is stored in txt files, + which is formed as the following format: img_path, transcription + """ + + def __init__(self, + separator: str = ',', + ignore: str = '#', + format: str = 'img,text', + encoding: str = 'utf-8-sig', + nproc: int = 1) -> None: + self.sep = separator + self.format = format + self.encoding = encoding + self.ignore = ignore + super().__init__(nproc=nproc) + + def parse_files(self, files: str, split: str) -> List: + """Parse annotations.""" + assert isinstance(files, str) + samples = list() + for anno in txt_loader( + file_path=files, format=self.format, encoding=self.encoding): + text = anno['text'].strip().replace('"', '') + samples.append((anno['img'], text)) + + return samples diff --git a/mmocr/datasets/preparers/parsers/loaders.py b/mmocr/datasets/preparers/parsers/loaders.py new file mode 100644 index 000000000..90145b6b5 --- /dev/null +++ b/mmocr/datasets/preparers/parsers/loaders.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re +from typing import Dict, Tuple, Union + +import yaml + + +def txt_loader(file_path: str, + separator: str = ',', + format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', + encoding='utf-8') -> Union[Dict, str]: + """Loading txt format annotations. + + Args: + file_path (str): Path to the txt file. + separator (str, optional): Separator of data. Defaults to ','. + format (str, optional): Annotation format. + Defaults to 'x1,y1,x2,y2,x3,y3,x4,y4,trans'. + encoding (str, optional): Encoding format. Defaults to 'utf-8'. + + Yields: + Iterator[Union[Dict, str]]: Original text line or a dict containing + the information of the text line. 
+ """ + keys = format.split(separator) + with open(file_path, 'r', encoding=encoding) as f: + for line in f.readlines(): + line = line.strip() + if line: + yield dict(zip(keys, line.split(separator))) + + +def totaltext_loader(file_path: str) -> str: + """The annotation of the totaltext dataset may be stored in multiple lines, + this loader is designed for this special case. + + Args: + file_path (str): Path to the txt file + + Yield: + str: Complete annotation of the txt file + """ + + def parsing_line(line: str) -> Tuple: + """Parsing a line of the annotation. + + Args: + line (str): A line of the annotation. + + Returns: + Tuple: A tuple of (polygon, transcription). + """ + line = '{' + line.replace('[[', '[').replace(']]', ']') + '}' + ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line) + ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict) + ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict) + ann_dict = ann_dict.replace("[u',']", "[u'#']") + ann_dict = yaml.safe_load(ann_dict) + + # polygon + xs, ys = ann_dict['x'], ann_dict['y'] + poly = [] + for x, y in zip(xs, ys): + poly.append(x) + poly.append(y) + # text + text = ann_dict['transcriptions'] + if len(text) == 0: + text = '#' + else: + word = text[0] + if len(text) > 1: + for ann_word in text[1:]: + word += ',' + ann_word + text = str(eval(word)) + + return poly, text + + with open(file_path, 'r') as f: + for idx, line in enumerate(f): + line = line.strip() + if idx == 0: + tmp_line = line + continue + if not line.startswith('x:'): + tmp_line += ' ' + line + continue + complete_line = tmp_line + tmp_line = line + yield parsing_line(complete_line) + + if tmp_line != '': + yield parsing_line(tmp_line) diff --git a/mmocr/datasets/preparers/parsers/totaltext_parser.py b/mmocr/datasets/preparers/parsers/totaltext_parser.py new file mode 100644 index 000000000..199d5258d --- /dev/null +++ b/mmocr/datasets/preparers/parsers/totaltext_parser.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from typing import Dict, Tuple + +from ..data_preparer import DATA_PARSER +from .base import BaseParser +from .loaders import totaltext_loader + + +@DATA_PARSER.register_module() +class TotaltextTextDetParser(BaseParser): + """TotalText Text Detection Parser. + + The original annotation format of this dataset is stored in txt files, + which is formed as the following format: + x: [[x1 x2 x3 ... xn]], y: [[y1 y2 y3 ... yn]], + ornt: [u'c'], transcriptions: [u'transcription'] + """ + + def __init__(self, + data_root: str, + ignore: str = '#', + nproc: int = 1) -> None: + self.ignore = ignore + super().__init__(data_root=data_root, nproc=nproc) + + def parse_file(self, file: Tuple, split: str) -> Dict: + """Convert single annotation.""" + img_file, txt_file = file + instances = list() + for poly, text in totaltext_loader(txt_file): + instances.append( + dict(poly=poly, text=text, ignore=text == self.ignore)) + + return img_file, instances diff --git a/mmocr/datasets/preparers/parsers/wildreceipt.py b/mmocr/datasets/preparers/parsers/wildreceipt.py new file mode 100644 index 000000000..180a1e78f --- /dev/null +++ b/mmocr/datasets/preparers/parsers/wildreceipt.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from typing import Dict, Tuple + +from mmocr.utils import list_from_file +from ..data_preparer import DATA_PARSER +from .base import BaseParser + + +@DATA_PARSER.register_module() +class WildreceiptTextDetParser(BaseParser): + """Wildreceipt Text Detection Parser. 
+ + The original annotation format of this dataset is stored in txt files, + which is formed as the following json line format: + {"file_name": "xxx/xxx/xx/xxxx.jpeg", + "height": 1200, + "width": 1600, + "annotations": [ + "box": [x1, y1, x2, y2, x3, y3, x4, y4], + "text": "xxx", + "label": 25, + ]} + """ + + def __init__(self, + data_root: str, + ignore: int = 0, + nproc: int = 1) -> None: + self.ignore = ignore + super().__init__(data_root=data_root, nproc=nproc) + + def parse_files(self, files: Tuple, split: str) -> Dict: + """Convert single annotation.""" + closeset_lines = list_from_file(files) + samples = list() + for line in closeset_lines: + instances = list() + line = json.loads(line) + img_file = osp.join(self.data_root, line['file_name']) + for anno in line['annotations']: + poly = anno['box'] + text = anno['text'] + label = anno['label'] + instances.append( + dict(poly=poly, text=text, ignore=label == self.ignore)) + samples.append((img_file, instances)) + + return samples + + +@DATA_PARSER.register_module() +class WildreceiptKIEParser(BaseParser): + """Wildreceipt KIE Parser. 
+ + The original annotation format of this dataset is stored in txt files, + which is formed as the following json line format: + {"file_name": "xxx/xxx/xx/xxxx.jpeg", + "height": 1200, + "width": 1600, + "annotations": [ + "box": [x1, y1, x2, y2, x3, y3, x4, y4], + "text": "xxx", + "label": 25, + ]} + """ + + def __init__(self, + data_root: str, + ignore: int = 0, + nproc: int = 1) -> None: + self.ignore = ignore + super().__init__(data_root=data_root, nproc=nproc) + + def parse_files(self, files: Tuple, split: str) -> Dict: + """Convert single annotation.""" + closeset_lines = list_from_file(files) + + return closeset_lines diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index 2f878107a..575194aee 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -6,7 +6,8 @@ is_type_list, valid_boundary) from .collect_env import collect_env from .data_converter_utils import dump_ocr_data, recog_anno_to_imginfo -from .fileio import list_from_file, list_to_file +from .fileio import (check_integrity, iszip, list_from_file, list_to_file, + retrieve_files) from .img_utils import crop_img, warp_img from .mask_utils import fill_hole from .parsers import LineJsonParser, LineStrParser @@ -40,5 +41,6 @@ 'ConfigType', 'DetSampleList', 'RecForwardResults', 'InitConfigType', 'OptConfigType', 'OptDetSampleList', 'OptInitConfigType', 'OptMultiConfig', 'OptRecSampleList', 'RecSampleList', 'MultiConfig', 'OptTensor', - 'ColorType', 'OptKIESampleList', 'KIESampleList' + 'ColorType', 'OptKIESampleList', 'KIESampleList', 'iszip', + 'check_integrity', 'retrieve_files' ] diff --git a/mmocr/utils/fileio.py b/mmocr/utils/fileio.py index d5651d844..05020ec3f 100644 --- a/mmocr/utils/fileio.py +++ b/mmocr/utils/fileio.py @@ -1,7 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import os +import hashlib +import os.path as osp +import sys +from glob import glob +from typing import List -import mmengine +from mmengine import mkdir_or_exist def list_to_file(filename, lines): @@ -11,7 +15,7 @@ def list_to_file(filename, lines): filename (str): The output filename. It will be created/overwritten. lines (list(str)): Data to be written. """ - mmengine.mkdir_or_exist(os.path.dirname(filename)) + mkdir_or_exist(osp.dirname(filename)) with open(filename, 'w', encoding='utf-8') as fw: for line in lines: fw.write(f'{line}\n') @@ -36,3 +40,66 @@ def list_from_file(filename, encoding='utf-8'): for line in f: item_list.append(line.rstrip('\n\r')) return item_list + + +def iszip(file_path: str) -> bool: + """Check whether the file is a supported zip format. + + Args: + file_path (str): Path to the file. + + Returns: + bool: Whether the file is a zip. + """ + + suffixes = ['zip', 'tar', 'tar.gz'] + + for suffix in suffixes: + if file_path.endswith(suffix): + return True + return False + + +def check_integrity(file_path: str, + md5: str, + chunk_size: int = 1024 * 1024) -> bool: + """Check if the file exist and match to the given md5 code. + + Args: + file_path (str): Path to the file. + md5 (str): MD5 to be matched. + chunk_size (int, optional): Chunk size. Defaults to 1024*1024. + + Returns: + bool: Whether the md5 is matched. + """ + if not osp.exists(file_path): + return False + + if sys.version_info >= (3, 9): + hash = hashlib.md5(usedforsecurity=False) + else: + hash = hashlib.md5() + with open(file_path, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): + hash.update(chunk) + + return hash.hexdigest() == md5 + + +def retrieve_files(path: str, suffixes: List) -> List: + """Retrieve file list from the path. + + Args: + path (str): Path to the directory. + suffixes (list[str], optional): Suffixes to be retrieved. + + Returns: + List: List of the files. 
+ """ + + file_list = [] + for suffix in suffixes: + file_list.extend(glob(osp.join(path, '*' + suffix))) + + return file_list diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py new file mode 100644 index 000000000..d1403b342 --- /dev/null +++ b/tools/dataset_converters/prepare_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +import warnings + +from mmocr.datasets.preparers import DatasetPreparer +from mmocr.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Preparing datasets used in MMOCR.') + parser.add_argument( + 'datasets', + help='A list of the dataset names that would like to prepare.', + nargs='+') + parser.add_argument( + '--nproc', help='Number of processes to run', default=4, type=int) + parser.add_argument( + '--task', + default='textdet', + choices=['textdet', 'textrecog', 'textspotting', 'kie'], + help='Task type. Options are det and rec.') + parser.add_argument( + '--dataset-zoo-path', + default='./dataset_zoo', + help='Path to dataset zoo config files.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + register_all_modules() + for dataset in args.datasets: + if not osp.isdir(osp.join(args.dataset_zoo_path, dataset)): + warnings.warn(f'{dataset} is not supported yet. 
Please check ' + 'dataset zoo for supported datasets.') + continue + preparer = DatasetPreparer( + cfg_path=args.dataset_zoo_path, + dataset_name=dataset, + task=args.task, + nproc=args.nproc) + preparer() + + +if __name__ == '__main__': + main() From 3b1e828a13f4b05829df45dec12276d4d2524077 Mon Sep 17 00:00:00 2001 From: xinyu Date: Mon, 24 Oct 2022 16:45:32 +0800 Subject: [PATCH 02/20] temporarily ignore data preparer test --- .dev_scripts/covignore.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dev_scripts/covignore.cfg b/.dev_scripts/covignore.cfg index 213afd4b8..00ec54b01 100644 --- a/.dev_scripts/covignore.cfg +++ b/.dev_scripts/covignore.cfg @@ -13,3 +13,6 @@ mmocr/models/textdet/detectors/mmdet_wrapper.py # It will be removed after KieVisualizer and TextSpotterVisualizer mmocr/visualization/visualize.py + +# Add tests for data preparers later +mmocr/datasets/preparers From eed854d334588d2299865a0fe26111d5318528d4 Mon Sep 17 00:00:00 2001 From: xinyu Date: Wed, 26 Oct 2022 16:08:46 +0800 Subject: [PATCH 03/20] update --- dataset_zoo/icdar2015/textdet.py | 6 +- dataset_zoo/icdar2015/textrecog.py | 6 +- dataset_zoo/totaltext/textdet.py | 6 +- dataset_zoo/wildreceipt/kie.py | 6 +- dataset_zoo/wildreceipt/textdet.py | 2 +- .../data_prepare/dataset_preparer.md | 116 +++++++++++++++++- docs/en/user_guides/data_prepare/det.md | 2 +- docs/en/user_guides/data_prepare/kie.md | 2 +- docs/en/user_guides/data_prepare/recog.md | 2 +- .../data_prepare/dataset_preparer.md | 114 ++++++++++++++++- docs/zh_cn/user_guides/data_prepare/det.md | 2 +- docs/zh_cn/user_guides/data_prepare/kie.md | 2 +- docs/zh_cn/user_guides/data_prepare/recog.md | 2 +- mmocr/datasets/preparers/data_converter.py | 105 ++++++++-------- mmocr/datasets/preparers/data_obtainer.py | 8 +- mmocr/datasets/preparers/data_preparer.py | 18 ++- mmocr/datasets/preparers/dumpers/dumpers.py | 6 +- mmocr/datasets/preparers/parsers/__init__.py | 10 +- mmocr/datasets/preparers/parsers/base.py | 45 ++++++- 
.../datasets/preparers/parsers/ic15_parser.py | 44 +++++-- mmocr/datasets/preparers/parsers/loaders.py | 93 -------------- .../preparers/parsers/totaltext_parser.py | 79 +++++++++++- .../datasets/preparers/parsers/wildreceipt.py | 22 +++- mmocr/utils/__init__.py | 4 +- mmocr/utils/fileio.py | 6 +- 25 files changed, 495 insertions(+), 213 deletions(-) delete mode 100644 mmocr/datasets/preparers/parsers/loaders.py diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py index 2c4310863..71c00c798 100644 --- a/dataset_zoo/icdar2015/textdet.py +++ b/dataset_zoo/icdar2015/textdet.py @@ -1,5 +1,5 @@ data_root = './data/icdar2015' -cache_path = './data/.cache' +cache_path = './data/cache' data_obtainer = dict( type='NaiveDataObtainer', @@ -42,10 +42,10 @@ type='TextDetDataConverter', splits=['train', 'test'], data_root=data_root, - gather=dict( + gatherer=dict( type='pair_gather', suffixes=['.jpg', '.JPG'], rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), - parser=dict(type='ICDAR2015TextDetParser'), + parser=dict(type='ICDAR2015TextDetAnnParser'), dumper=dict(type='JsonDumper'), delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py index febf6c4ae..83ed40e55 100644 --- a/dataset_zoo/icdar2015/textrecog.py +++ b/dataset_zoo/icdar2015/textrecog.py @@ -1,5 +1,5 @@ data_root = './data/icdar2015' -cache_path = './data/.cache' +cache_path = './data/cache' data_obtainer = dict( type='NaiveDataObtainer', @@ -37,6 +37,6 @@ type='TextRecogDataConverter', splits=['train', 'test'], data_root=data_root, - gather=dict(type='mono_gather', mapping="f'{split}.txt'"), - parser=dict(type='ICDAR2015TextRecogParser'), + gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"), + parser=dict(type='ICDAR2015TextRecogAnnParser'), dumper=dict(type='JsonDumper')) diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py index 
8ad044989..dad673128 100644 --- a/dataset_zoo/totaltext/textdet.py +++ b/dataset_zoo/totaltext/textdet.py @@ -1,5 +1,5 @@ data_root = './data/totaltext' -cache_path = './data/.cache' +cache_path = './data/cache' data_obtainer = dict( type='NaiveDataObtainer', @@ -30,10 +30,10 @@ type='TextDetDataConverter', splits=['train', 'test'], data_root=data_root, - gather=dict( + gatherer=dict( type='pair_gather', suffixes=['.jpg', '.JPG'], rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']), - parser=dict(type='TotaltextTextDetParser', data_root=data_root), + parser=dict(type='TotaltextTextDetAnnParser', data_root=data_root), dumper=dict(type='JsonDumper'), delete=['totaltext', 'txt_format', 'annotations']) diff --git a/dataset_zoo/wildreceipt/kie.py b/dataset_zoo/wildreceipt/kie.py index 6a1f3b613..32685411b 100644 --- a/dataset_zoo/wildreceipt/kie.py +++ b/dataset_zoo/wildreceipt/kie.py @@ -1,5 +1,5 @@ data_root = './data/wildreceipt' -cache_path = './data/.cache' +cache_path = './data/cache' data_obtainer = dict( type='NaiveDataObtainer', @@ -25,8 +25,8 @@ type='WildReceiptConverter', splits=['train', 'test'], data_root=data_root, - gather=dict( + gatherer=dict( type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root), - parser=dict(type='WildreceiptKIEParser', data_root=data_root), + parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root), dumper=dict(type='WildreceiptOpensetDumper'), delete=['wildreceipt']) diff --git a/dataset_zoo/wildreceipt/textdet.py b/dataset_zoo/wildreceipt/textdet.py index cffb8d59d..faa0decfe 100644 --- a/dataset_zoo/wildreceipt/textdet.py +++ b/dataset_zoo/wildreceipt/textdet.py @@ -2,5 +2,5 @@ data_converter = dict( type='TextDetDataConverter', - parser=dict(type='WildreceiptTextDetParser'), + parser=dict(type='WildreceiptTextDetAnnParser'), dumper=dict(type='JsonDumper')) diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md index d95b51359..e08cd4ee9 
100644 --- a/docs/en/user_guides/data_prepare/dataset_preparer.md +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -14,7 +14,7 @@ python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK | ------------ | ---- | ----------------------------------------------------------------------------------------------------------------------------------------- | | dataset_name | str | (required) dataset name. | | --task | str | Convert the dataset to the format of a specified task supported by MMOCR. options are: 'textdet', 'textrecog', 'textspotting', and 'kie'. | -| --nproc | int | Number of processors to be used. Defaults to 4. | +| --nproc | int | Number of processes to be used. Defaults to 4. | For example, the following command shows how to use the script to prepare the ICDAR2015 dataset for text detection task. @@ -36,4 +36,116 @@ The following table shows the supported datasets. | totaltext | ✓ | ✓ | ✓ | | | wildreceipt | ✓ | ✓ | ✓ | ✓ | -## Advanced Usage\[Coming Soon\] +## Advanced Usage + +### Configuration of Dataset Preparer + +Dataset preparer uses a modular design to enhance extensibility, which allows users to extend it to other public or private datasets easily. The configuration files of the dataset preparers are stored in the `dataset_zoo/`, where all the configs of currently supported datasets can be found here. The directory structure is as follows: + +```text +dataset_zoo/ +├── icdar2015 +│ ├── metafile.yml +│ ├── textdet.py +│ ├── textrecog.py +│ └── textspotting.py +└── wildreceipt + ├── metafile.yml + ├── kie.py + ├── textdet.py + ├── textrecog.py + └── textspotting.py +``` + +`metafile.yml` is the metafile of the dataset, which contains the basic information of the dataset, including the year of publication, the author of the paper, and other information such as license. 
The other files named by the task are the configuration files of the dataset preparer, which are used to configure the download, decompression, format conversion, etc. of the dataset. These configs are in Python format, and their usage is completely consistent with the configuration files in MMOCR repo. See [Configuration File Documentation](../config.md) for detailed usage. + +Next, we will introduce the conventional fields and usage of the dataset preparer configuration files. + +In the configuration files, there are two fields `data_root` and `cache_path`, which are used to store the converted dataset and the temporary files such as the archived files downloaded during the data preparation process. + +```python +data_root = './data/icdar2015' +cache_path = './data/cache' +``` + +Data preparation usually contains two steps: "raw data preparation" and "format conversion and saving". Therefore, we use the `data_obtainer` and `data_converter` to configure the behavior of these two steps. In some cases, users can also ignore `data_converter` to only download and decompress the raw data, without performing format conversion and saving. Or, for the local stored dataset, use ignore `data_obtainer` to only perform format conversion and saving. 
+ +Take the text detection task of the ICDAR2015 dataset (`dataset_zoo/icdar2015/textdet.py`) as an example: + +```python +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip', + save_name='ic15_textdet_train_img.zip', + md5='c51cbace155dcc4d98c8dd19d378f30d', + split=['train'], + content=['image'], + mapping=[['ic15_textdet_train_img', 'imgs/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', + save_name='ic15_textdet_test_img.zip', + md5='97e4c1ddcf074ffcc75feff2b63c35dd', + split=['test'], + content=['image'], + mapping=[['ic15_textdet_test_img', 'imgs/test']]), + ]) +``` + +The default type of `data_obtainer` is `NaiveDataObtainer`, which mainly downloads and decompresses the original files to the specified directory. Here, we configure the URL, save name, MD5 value, etc. of the original dataset files through the `files` parameter. The `mapping` parameter is used to specify the path where the data is decompressed or moved. In addition, the two optional parameters `split` and `content` respectively indicate the content type stored in the compressed file and the corresponding dataset. + +```python +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'test'], + data_root=data_root, + gatherer=dict( + type='pair_gather', + suffixes=['.jpg', '.JPG'], + rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), + parser=dict(type='ICDAR2015TextDetAnnParser'), + dumper=dict(type='JsonDumper'), + delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) +``` + +`data_converter` is responsible for loading and converting the original to the format supported by MMOCR. 
We provide a number of built-in data converters for different tasks, such as `TextDetDataConverter`, `TextRecogDataConverter`, `TextSpottingDataConverter`, and `WildReceiptConverter` (Since we only support WildReceipt dataset for KIE task at present, we only provide this converter for now). + +Take the text detection task as an example, `TextDetDataConverter` mainly completes the following work: + +- Collect and match the images and original annotation files, such as the image `img_1.jpg` and the annotation `gt_img_1.txt` +- Load and parse the original annotations to obtain necessary information such as the bounding box and text +- Convert the parsed data to the format supported by MMOCR +- Dump the converted data to the specified path and format + +The above steps can be configured separately through `gatherer`, `parser`, `dumper`. + +Specifically, the `gatherer` is used to collect and match the images and annotations in the original dataset. Typically, there are two relations between images and annotations, one is many-to-many, the other is many-to-one. + +```text +many-to-many +├── img_1.jpg +├── gt_img_1.txt +├── img_2.jpg +├── gt_img_2.txt +├── img_3.JPG +├── gt_img_3.txt + +one-to-many +├── img_1.jpg +├── img_2.jpg +├── img_3.JPG +├── gt.txt +``` + +Therefore, we provide two built-in gatherers, `pair_gather` and `mono_gather`, to handle the two cases. `pair_gather` is used for the case of many-to-many, and `mono_gather` is used for the case of one-to-many. `pair_gather` needs to specify the `suffixes` parameter to indicate the suffix of the image, such as `suffixes=[.jpg,.JPG]` in the above example. In addition, we need to specify the corresponding relationship between the image and the annotation file through the regular expression, such as `rule=[r'img_(\d+)\.([jJ][pP][gG])',r'gt_img_\1.txt']` in the above example. 
Where `\d+` is used to match the serial number of the image, `([jJ][pP][gG])` is used to match the suffix of the image, and `\_1` matches the serial number of the image and the serial number of the annotation file. + +When the image and annotation file are matched, the the original annotations will be parsed. Since the annotation format is usually varied from datasets to datasets, the parsers are usually dataset related. Then, the parser will pack the required data into the MMOCR format. + +Finally, we can specify the dumpers to decide the data format. Currently, we only support `JsonDumper` and `WildreceiptOpensetDumper`, where the former is used to save the data in the standard MMOCR Json format, and the latter is used to save the data in the Wildreceipt format. In the future, we plan to support `LMDBDumper` to save the annotation files in LMDB format. + +### Use DataPreparer to prepare customized dataset + +\[Coming Soon\] diff --git a/docs/en/user_guides/data_prepare/det.md b/docs/en/user_guides/data_prepare/det.md index f734b9ea1..f0ff5eac9 100644 --- a/docs/en/user_guides/data_prepare/det.md +++ b/docs/en/user_guides/data_prepare/det.md @@ -1,7 +1,7 @@ # Text Detection\[Deprecated\] ```{warning} -This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). +This page is deprecated and all these scripts will be eventually migrated into dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)! ``` ## Overview diff --git a/docs/en/user_guides/data_prepare/kie.md b/docs/en/user_guides/data_prepare/kie.md index 4e1d59dcd..c40254845 100644 --- a/docs/en/user_guides/data_prepare/kie.md +++ b/docs/en/user_guides/data_prepare/kie.md @@ -1,7 +1,7 @@ # Key Information Extraction\[Deprecated\] ```{warning} -This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). 
+This page is deprecated and all these scripts will be eventually migrated into dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)! ``` ## Overview diff --git a/docs/en/user_guides/data_prepare/recog.md b/docs/en/user_guides/data_prepare/recog.md index 63b42ad21..1d25cbea1 100644 --- a/docs/en/user_guides/data_prepare/recog.md +++ b/docs/en/user_guides/data_prepare/recog.md @@ -1,7 +1,7 @@ # Text Recognition\[Deprecated\] ```{warning} -This page is deprecated and will soon be removed. Please refer to our new [dataset preparer](./dataset_preparer.md). +This page is deprecated and all these scripts will be eventually migrated into dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)! ``` ## Overview diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md index 01497d8f6..014f1b572 100644 --- a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md +++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md @@ -36,4 +36,116 @@ python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task te | totaltext | ✓ | ✓ | ✓ | | | wildreceipt | ✓ | ✓ | ✓ | ✓ | -## 进阶用法\[待更新\] +## 进阶用法 + +### 数据集配置 + +数据集自动化准备脚本使用了模块化的设计,极大地增强了扩展性,用户能够很方便地配置其他公开数据集或私有数据集。数据集自动化准备脚本的配置文件被统一存储在 `dataset_zoo/` 目录下,用户可以在该目录下找到所有已由 MMOCR 官方支持的数据集准备脚本配置文件。该文件夹的目录结构如下: + +```text +dataset_zoo/ +├── icdar2015 +│ ├── metafile.yml +│ ├── textdet.py +│ ├── textrecog.py +│ └── textspotting.py +└── wildreceipt + ├── metafile.yml + ├── kie.py + ├── textdet.py + ├── textrecog.py + └── textspotting.py +``` + +其中,`metafile.yml` 是数据集的元信息文件,其中存放了对应数据集的基本信息,包括发布年份,论文作者,以及版权等其他信息。其它以任务名命名的则是数据集准备脚本的配置文件,用于配置数据集的下载、解压、格式转换等操作。这些配置文件采用了 Python 格式,其使用方法与 MMOCR 算法库的其他配置文件完全一致,详见[配置文件文档](../config.md)。 + +下面,我们将介绍数据集准备脚本配置文件的默认字段与使用方法。 + +我们在配置文件中提供了 `data_root` 与 
`cache_path` 两个默认字段,分别用于存放转换后的 MMOCR 格式的数据集文件,以及在数据准备过程中下载的压缩包等临时文件。 + +```python +data_root = './data/icdar2015' +cache_path = './data/cache' +``` + +其次,数据集的准备通常包含了“原始数据准备”以及“格式转换和保存”这两个主要步骤。因此,我们约定通过 `data_obtainer` 和 `data_converter` 参数来配置这两个步骤的行为。在某些情况下,用户也可以通过缺省 `data_converter` 参数来仅进行原始数据的下载和解压,而不进行格式转换和保存。或者,对于本地存储的数据集,通过缺省 `data_obtainer` 参数来仅进行格式转换和保存。 + +以 ICDAR2015 数据集的文本检测任务准备配置文件(`dataset_zoo/icdar2015/textdet.py`)为例: + +```python +data_obtainer = dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip', + save_name='ic15_textdet_train_img.zip', + md5='c51cbace155dcc4d98c8dd19d378f30d', + split=['train'], + content=['image'], + mapping=[['ic15_textdet_train_img', 'imgs/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', + save_name='ic15_textdet_test_img.zip', + md5='97e4c1ddcf074ffcc75feff2b63c35dd', + split=['test'], + content=['image'], + mapping=[['ic15_textdet_test_img', 'imgs/test']]), + ]) +``` + +数据准备器 `data_obtainer` 的类型默认为 `NaiveDataObtainer`,其主要功能是依次下载压缩包并解压到指定目录。在这里,我们通过 `files` 参数来配置下载的压缩包的 URL、保存名称、MD5 值等信息。其中,`mapping` 参数用于指定该压缩包中的数据解压后的存放路径。另外 `split` 和 `content` 这两个可选参数则分别标明了该压缩包中存储的内容类型与其对应的数据集合。 + +```python +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'test'], + data_root=data_root, + gatherer=dict( + type='pair_gather', + suffixes=['.jpg', '.JPG'], + rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), + parser=dict(type='ICDAR2015TextDetAnnParser'), + dumper=dict(type='JsonDumper'), + delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) +``` + +数据转换器 `data_converter` 负责完成原始数据的读取与格式转换,并保存为 MMOCR 支持的格式。其中我们针对不同的任务,提供了内置的集中数据转换器,如文本检测任务数据转换器 `TextDetDataConverter`,文本识别任务数据转换器 `TextRecogDataConverter`,端到端文本检测识别任务转换器 `TextSpottingDataConverter`,以及关键信息抽取任务数据转换器 `WildReceiptConverter`(由于关键信息抽取任务目前仅支持 WildReceipt 数据集,我们暂时只提供了基于该数据集的数据转换器)。 + 
+以文本检测任务为例,`TextDetDataConverter` 主要完成以下工作: + +- 收集并匹配原始数据集中的图片与标注文件,如图像 `img_1.jpg` 与 标注 `gt_img_1.txt` +- 读取原始标注文件,解析出文本框坐标与文本内容等必要信息 +- 将解析后的数据统一转换至 MMOCR 支持的格式 +- 将转换后的数据保存为指定路径和格式 + +以上个步骤我们分别可以通过 `gatherer`,`parser`,`dumper` 来进行配置。 + +具体而言,`gatherer` 用于收集并匹配原始数据集中的图片与标注文件。常用的 OCR 数据集通常有两种标注保存形式,一种为多个标注文件对应多张图片,一种则为单个标注文件对应多张图片,如: + +```text +多对多 +├── img_1.jpg +├── gt_img_1.txt +├── img_2.jpg +├── gt_img_2.txt +├── img_3.JPG +├── gt_img_3.txt + +单对多 +├── img_1.jpg +├── img_2.jpg +├── img_3.JPG +├── gt.txt +``` + +因此,我们内置了 `pair_gather` 与 `mono_gather` 来处理以上这两种情况。其中 `pair_gather` 用于多对多的情况,`mono_gather` 用于单对多的情况。`pair_gather` 需要指定 `suffixes` 参数,用于指定图片的后缀名,如上述例子中的 `suffixes=[.jpg,.JPG]`。此外,还需要通过正则表达式来指定图片与标注文件的对应关系,如上述例子中的 `rule=[r'img_(\d+)\.([jJ][pP][gG])',r'gt_img_\1.txt']`。其中 `\d+` 用于匹配图片的序号,`([jJ][pP][gG])` 用于匹配图片的后缀名,`\_1` 则将匹配到的图片序号与标注文件序号对应起来。 + +当获取了图像与标注文件的对应关系后,data preparer 将解析原始标注文件。由于不同数据集的标注格式通常有很大的区别,当我们需要支持新的数据集时,通常需要实现一个新的 `parser` 来解析原始标注文件。parser 将任务相关的数据解析后打包成 MMOCR 的统一格式。 + +最后,我们可以通过指定不同的 dumper 来决定要将数据保存为何种格式。目前,我们仅支持 `JsonDumper` 与 `WildreceiptOpensetDumper`,其中,前者用于将数据保存为标准的 MMOCR Json 格式,而后者用于将数据保存为 Wildreceipt 格式。未来,我们计划支持 `LMDBDumper` 用于保存 LMDB 格式的标注文件。 + +### 使用 Data Preparer 准备自定义数据集 + +\[待更新\] diff --git a/docs/zh_cn/user_guides/data_prepare/det.md b/docs/zh_cn/user_guides/data_prepare/det.md index dc53794a0..ba5c583bf 100644 --- a/docs/zh_cn/user_guides/data_prepare/det.md +++ b/docs/zh_cn/user_guides/data_prepare/det.md @@ -1,7 +1,7 @@ # 文字检测\[过时\] ```{warning} -该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 +该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。 ``` ## 概览 diff --git a/docs/zh_cn/user_guides/data_prepare/kie.md b/docs/zh_cn/user_guides/data_prepare/kie.md index 2e96cae55..eb5ec6741 100644 --- a/docs/zh_cn/user_guides/data_prepare/kie.md +++ b/docs/zh_cn/user_guides/data_prepare/kie.md @@ -1,7 +1,7 @@ # 关键信息提取\[过时\] ```{warning} 
-该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 +该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。 ``` ## 概览 diff --git a/docs/zh_cn/user_guides/data_prepare/recog.md b/docs/zh_cn/user_guides/data_prepare/recog.md index 3925ca42a..cede41ab1 100644 --- a/docs/zh_cn/user_guides/data_prepare/recog.md +++ b/docs/zh_cn/user_guides/data_prepare/recog.md @@ -1,7 +1,7 @@ # 文字识别\[过时\] ```{warning} -该页面内容已经过时并将在近期删除,请查看我们全新的[数据准备](./dataset_preparer.md)页面。 +该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。 ``` ## 概览 diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index 26f37e327..a1a2e883f 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -11,7 +11,7 @@ from mmengine import mkdir_or_exist, track_parallel_progress from mmocr.utils import bbox2poly, crop_img, poly2bbox, retrieve_files -from .data_preparer import DATA_CONVERTER, DATA_DUMPER, DATA_PARSER +from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS class BaseDataConverter: @@ -20,7 +20,7 @@ class BaseDataConverter: Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset files. dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. 
@@ -32,7 +32,7 @@ class BaseDataConverter: def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -47,14 +47,14 @@ def __init__(self, self.delete = delete parser.update(dict(nproc=nproc)) dumper.update(dict(task=task)) - self.parser = DATA_PARSER.build(parser) - self.dumper = DATA_DUMPER.build(dumper) - gather_type = gather.pop('type') - self.gather_args = gather + self.parser = DATA_PARSERS.build(parser) + self.dumper = DATA_DUMPERS.build(dumper) + gather_type = gatherer.pop('type') + self.gatherer_args = gatherer if gather_type == 'pair_gather': - self.gather = self.pair_gather + self.gatherer = self.pair_gather elif gather_type == 'mono_gather': - self.gather = self.mono_gather + self.gatherer = self.mono_gather else: raise NotImplementedError @@ -66,10 +66,10 @@ def __call__(self): # Gather the info such as file names required by parser img_path = osp.join(self.data_root, 'imgs', split) ann_path = osp.join(self.data_root, 'annotations') - gather_args = dict( + gatherer_args = dict( img_path=img_path, ann_path=ann_path, split=split) - gather_args.update(self.gather_args) - files = self.gather(**gather_args) + gatherer_args.update(self.gatherer_args) + files = self.gatherer(**gatherer_args) # Convert dataset annotations to MMOCR format samples = self.parser.parse_files(files, split) print(f'Packing {split} annotations...') @@ -107,7 +107,7 @@ def add_meta(self, sample: Dict) -> Dict: def mono_gather(self, ann_path: str, mapping: str, split: str, **kwargs) -> str: - """Gathering the dataset file. Specifically for the case that only one + """Gather the dataset file. Specifically for the case that only one annotation file is needed. 
For example, img_001.jpg \ @@ -127,22 +127,26 @@ def mono_gather(self, ann_path: str, mapping: str, split: str, return osp.join(ann_path, eval(mapping)) - def pair_gather(self, img_path, suffixes, rule: Sequence, + def pair_gather(self, img_path: str, suffixes: List, rule: Sequence, **kwargs) -> List[Tuple]: - """Gathering the dataset files. Specifically for the paired - annotations. That is to say, each image has a corresponding annotation - file. For example, + """Gather the dataset files. Specifically for the paired annotations. + That is to say, each image has a corresponding annotation file. For + example, - img_001.jpg <---> gt_img_001.txt - img_002.jpg <---> gt_img_002.txt - img_003.jpg <---> gt_img_003.txt + img_1.jpg <---> gt_img_1.txt + img_2.jpg <---> gt_img_2.txt + img_3.jpg <---> gt_img_3.txt Args: img_path (str): Path to the images. suffixes (List[str]): File suffixes that used for searching. rule (Sequence): The rule for pairing the files. The first element is the matching pattern for the file, and the - second element is the replacement pattern. + second element is the replacement pattern, which should + be a regular expression. For example, to map the image + name img_1.jpg to the annotation name gt_img_1.txt, + the rule is + [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'] # noqa: W605 E501 Returns: List[Tuple]: A list of tuples (img_path, ann_path). @@ -163,14 +167,14 @@ def clean(self) -> None: shutil.rmtree(delete_file) -@DATA_CONVERTER.register_module() +@DATA_CONVERTERS.register_module() class TextDetDataConverter(BaseDataConverter): """Text detection data converter. Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset files. dumper (Dict): Config dict for dumping the dataset files. 
nproc (int): Number of processes to process the data. @@ -181,7 +185,7 @@ class TextDetDataConverter(BaseDataConverter): def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -189,7 +193,7 @@ def __init__(self, super().__init__( splits=splits, data_root=data_root, - gather=gather, + gatherer=gatherer, parser=parser, dumper=dumper, nproc=nproc, @@ -203,13 +207,13 @@ def pack_instance(self, """Pack the parsed annotation info to an MMOCR format instance. Args: - sample (Tuple): A tuple of (img_file, ann_file). - - img_path (str): Path to image file. + sample (Tuple): A tuple of (img_file, instances). + - img_path (str): Path to the image file. - instances (Sequence[Dict]): A list of converted annos. Each element should be a dict with the following keys: - 'poly' or 'box' - - ignore - - bbox_label (optional) + - 'ignore' + - 'bbox_label' (optional) split (str): The split of the instance. Returns: @@ -236,7 +240,7 @@ def pack_instance(self, packed_instances = dict( instances=packed_instances, - img_path=img_path.replace(self.data_root, ''), + img_path=img_path.replace(self.data_root + '/', ''), height=h, width=w) @@ -257,14 +261,14 @@ def add_meta(self, sample: Dict) -> Dict: return meta -@DATA_CONVERTER.register_module() +@DATA_CONVERTERS.register_module() class TextSpottingDataConverter(BaseDataConverter): """Text spotting data converter. Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset files. dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. 
@@ -275,7 +279,7 @@ class TextSpottingDataConverter(BaseDataConverter): def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -283,7 +287,7 @@ def __init__(self, super().__init__( splits=splits, data_root=data_root, - gather=gather, + gatherer=gatherer, parser=parser, dumper=dumper, nproc=nproc, @@ -302,8 +306,9 @@ def pack_instance(self, - instances (Sequence[Dict]): A list of converted annos. Each element should be a dict with the following keys: - 'poly' or 'box' - - ignore - - bbox_label (optional) + - 'text' + - 'ignore' + - 'bbox_label' (optional) split (str): The split of the instance. Returns: @@ -350,14 +355,14 @@ def add_meta(self, sample: Dict) -> Dict: return meta -@DATA_CONVERTER.register_module() +@DATA_CONVERTERS.register_module() class TextRecogDataConverter(BaseDataConverter): """Text recognition data converter. Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset annotations. dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. @@ -368,7 +373,7 @@ class TextRecogDataConverter(BaseDataConverter): def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -376,7 +381,7 @@ def __init__(self, super().__init__( splits=splits, data_root=data_root, - gather=gather, + gatherer=gatherer, parser=parser, dumper=dumper, nproc=nproc, @@ -418,7 +423,7 @@ def add_meta(self, sample: Dict) -> Dict: return meta -@DATA_CONVERTER.register_module() +@DATA_CONVERTERS.register_module() class TextRecogCropConverter(TextRecogDataConverter): """Text recognition crop converter. This converter will crop the text from the original image. 
The parser used for this Converter should be a TextDet @@ -427,7 +432,7 @@ class TextRecogCropConverter(TextRecogDataConverter): Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset annotations. dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. @@ -444,7 +449,7 @@ class TextRecogCropConverter(TextRecogDataConverter): def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -455,7 +460,7 @@ def __init__(self, super().__init__( splits=splits, data_root=data_root, - gather=gather, + gatherer=gatherer, parser=parser, dumper=dumper, nproc=nproc, @@ -507,7 +512,7 @@ def get_box(instance: Dict) -> List: return data_list -@DATA_CONVERTER.register_module() +@DATA_CONVERTERS.register_module() class WildReceiptConverter(BaseDataConverter): """MMOCR only supports wildreceipt dataset for KIE task now. This converter converts the wildreceipt dataset from close set to open set. @@ -515,7 +520,7 @@ class WildReceiptConverter(BaseDataConverter): Args: splits (List): A list of splits to be processed. data_root (str): Path to the data root. - gather (Dict): Config dict for gathering the dataset files. + gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset annotations. dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. @@ -530,7 +535,7 @@ class and "others" class. Defaults to True. 
def __init__(self, splits: List, data_root: str, - gather: Dict, + gatherer: Dict, parser: Dict, dumper: Dict, nproc: int, @@ -545,13 +550,17 @@ def __init__(self, super().__init__( splits=splits, data_root=data_root, - gather=gather, + gatherer=gatherer, parser=parser, dumper=dumper, nproc=nproc, task='kie', delete=delete) + def add_meta(self, samples: List) -> List: + """No meta info is required for the wildreceipt dataset.""" + return samples + def pack_instance(self, sample: str, split: str): """Pack line-json str of close set to line-json str of open set. diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py index 13311e790..cfa9a2921 100644 --- a/mmocr/datasets/preparers/data_obtainer.py +++ b/mmocr/datasets/preparers/data_obtainer.py @@ -8,13 +8,13 @@ from mmengine import mkdir_or_exist -from mmocr.utils import check_integrity, iszip -from .data_preparer import DATA_OBTAINER +from mmocr.utils import check_integrity, is_archive +from .data_preparer import DATA_OBTAINERS ssl._create_default_https_context = ssl._create_unverified_context -@DATA_OBTAINER.register_module() +@DATA_OBTAINERS.register_module() class NaiveDataObtainer: """A naive pipeline for obtaining dataset. @@ -93,7 +93,7 @@ def extract(self, to False. 
""" - if not iszip(src_path): + if not is_archive(src_path): # Move the file to the destination folder if it is not a zip shutil.move(src_path, dst_path) return diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 094a7c056..84f5390ee 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -5,10 +5,10 @@ from mmengine import Registry from mmengine.config import Config -DATA_OBTAINER = Registry('data_obtainer') -DATA_CONVERTER = Registry('data_converter') -DATA_PARSER = Registry('data_parser') -DATA_DUMPER = Registry('data_dumper') +DATA_OBTAINERS = Registry('data_obtainer') +DATA_CONVERTERS = Registry('data_converter') +DATA_PARSERS = Registry('data_parser') +DATA_DUMPERS = Registry('data_dumper') class DatasetPreparer: @@ -93,19 +93,17 @@ def parse_cfg(self, cfg_path: str) -> None: cfg = Config.fromfile(osp.join(cfg_path, self.task + '.py')) if 'data_obtainer' in cfg: - self.data_obtainer = DATA_OBTAINER.build(cfg.data_obtainer) + self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer) if 'data_converter' in cfg: cfg.data_converter.update(dict(nproc=self.nproc)) - self.data_converter = DATA_CONVERTER.build(cfg.data_converter) + self.data_converter = DATA_CONVERTERS.build(cfg.data_converter) @property def with_obtainer(self) -> bool: """bool: whether the data preparer has an obtainer""" - return hasattr(self, - 'data_obtainer') and self.data_obtainer is not None + return getattr(self, 'data_obtainer', None) is not None @property def with_processor(self) -> bool: """bool: whether the data preparer has an obtainer""" - return hasattr(self, - 'data_converter') and self.data_converter is not None + return getattr(self, 'data_converter', None) is not None diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py index c6fe205c0..93543cd88 100644 --- a/mmocr/datasets/preparers/dumpers/dumpers.py +++ 
b/mmocr/datasets/preparers/dumpers/dumpers.py @@ -5,10 +5,10 @@ import mmengine from mmocr.utils import list_to_file -from ..data_preparer import DATA_DUMPER +from ..data_preparer import DATA_DUMPERS -@DATA_DUMPER.register_module() +@DATA_DUMPERS.register_module() class JsonDumper: def __init__(self, task: str) -> None: @@ -20,7 +20,7 @@ def dump(self, data: List, data_root: str, split: str) -> None: mmengine.dump(data, dst_file) -@DATA_DUMPER.register_module() +@DATA_DUMPERS.register_module() class WildreceiptOpensetDumper: def __init__(self, task: str) -> None: diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 66070ccc2..1ee50a34c 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .ic15_parser import ICDAR2015TextDetParser, ICDAR2015TextRecogParser -from .totaltext_parser import TotaltextTextDetParser -from .wildreceipt import WildreceiptKIEParser +from .ic15_parser import ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser +from .totaltext_parser import TotaltextTextDetAnnParser +from .wildreceipt import WildreceiptKIEAnnParser __all__ = [ - 'ICDAR2015TextDetParser', 'ICDAR2015TextRecogParser', - 'TotaltextTextDetParser', 'WildreceiptKIEParser' + 'ICDAR2015TextDetAnnParser', 'ICDAR2015TextRecogAnnParser', + 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py index 83ef704a4..d6c93a01e 100644 --- a/mmocr/datasets/preparers/parsers/base.py +++ b/mmocr/datasets/preparers/parsers/base.py @@ -1,12 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import abstractmethod from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from mmengine import track_parallel_progress class BaseParser: + """Base class for parsing annotations. + + Args: + data_root (str, optional): Path to the data root. Defaults to None. + nproc (int, optional): Number of processes. Defaults to 1. + """ def __init__(self, data_root: Optional[str] = None, @@ -15,6 +21,16 @@ def __init__(self, self.nproc = nproc def __call__(self, files: List[Tuple], split: str) -> List: + """Parse annotations. + + Args: + files (List[Tuple]): A list of a tuple of + (image_path, annotation_path). + split (str): The split of the dataset. + + Returns: + List: A list of a tuple of (image_path, instances) + """ samples = self.parse_files(files, split) return samples @@ -25,7 +41,7 @@ def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]: files (Tuple): A tuple of path to image and annotation. Returns: - List[Tuple]: A list of a tuple of (image_path, instance) + List[Tuple]: A list of a tuple of (image_path, instances) """ func = partial(self.parse_file, split=split) samples = track_parallel_progress(func, files, nproc=self.nproc) @@ -35,3 +51,28 @@ def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]: def parse_file(self, file: Tuple, split: str) -> Dict: """Convert annotation for a single image.""" raise NotImplementedError + + def loader(self, + file_path: str, + separator: str = ',', + format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', + encoding='utf-8') -> Union[Dict, str]: + """A basic loader designed for .txt format annotation. + + Args: + file_path (str): Path to the txt file. + separator (str, optional): Separator of data. Defaults to ','. + format (str, optional): Annotation format. + Defaults to 'x1,y1,x2,y2,x3,y3,x4,y4,trans'. + encoding (str, optional): Encoding format. Defaults to 'utf-8'. 
+ + Yields: + Iterator[Union[Dict, str]]: Original text line or a dict containing + the information of the text line. + """ + keys = format.split(separator) + with open(file_path, 'r', encoding=encoding) as f: + for line in f.readlines(): + line = line.strip() + if line: + yield dict(zip(keys, line.split(separator))) diff --git a/mmocr/datasets/preparers/parsers/ic15_parser.py b/mmocr/datasets/preparers/parsers/ic15_parser.py index b40cdb26c..fff7b81e8 100644 --- a/mmocr/datasets/preparers/parsers/ic15_parser.py +++ b/mmocr/datasets/preparers/parsers/ic15_parser.py @@ -1,18 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, List, Tuple -from ..data_preparer import DATA_PARSER +from ..data_preparer import DATA_PARSERS from .base import BaseParser -from .loaders import txt_loader -@DATA_PARSER.register_module() -class ICDAR2015TextDetParser(BaseParser): +@DATA_PARSERS.register_module() +class ICDAR2015TextDetAnnParser(BaseParser): """ICDAR2015 Text Detection Parser. The original annotation format of this dataset is stored in txt files, - which is formed as the following format: x1, y1, x2, y2, x3, y3, x4, y4, - transcription + which is formed as the following format: + x1, y1, x2, y2, x3, y3, x4, y4, transcription + + Args: + separator (str): The separator between each element in a line. Defaults + to ','. + ignore (str): The text to be ignored. Defaults to '###'. + format (str): The format of the annotation. Defaults to + 'x1,y1,x2,y2,x3,y3,x4,y4,trans'. + encoding (str): The encoding of the annotation file. Defaults to + 'utf-8-sig'. + nproc (int): The number of processes to parse the annotation. Defaults + to 1. 
""" def __init__(self, @@ -31,7 +41,8 @@ def parse_file(self, file: Tuple, split: str) -> Dict: """Parse single annotation.""" img_file, txt_file = file instances = list() - for anno in txt_loader(txt_file, self.sep, self.format, self.encoding): + for anno in self.loader(txt_file, self.sep, self.format, + self.encoding): anno = list(anno.values()) poly = list(map(float, anno[0:-1])) text = anno[-1] @@ -41,12 +52,23 @@ def parse_file(self, file: Tuple, split: str) -> Dict: return img_file, instances -@DATA_PARSER.register_module() -class ICDAR2015TextRecogParser(BaseParser): +@DATA_PARSERS.register_module() +class ICDAR2015TextRecogAnnParser(BaseParser): """ICDAR2015 Text Detection Parser. The original annotation format of this dataset is stored in txt files, - which is formed as the following format: img_path, transcription + which is formed as the following format: + img_path, transcription + + Args: + separator (str): The separator between each element in a line. Defaults + to ','. + ignore (str): The text to be ignored. Defaults to '#'. + format (str): The format of the annotation. Defaults to 'img, text'. + encoding (str): The encoding of the annotation file. Defaults to + 'utf-8-sig'. + nproc (int): The number of processes to parse the annotation. Defaults + to 1. """ def __init__(self, @@ -65,7 +87,7 @@ def parse_files(self, files: str, split: str) -> List: """Parse annotations.""" assert isinstance(files, str) samples = list() - for anno in txt_loader( + for anno in self.loader( file_path=files, format=self.format, encoding=self.encoding): text = anno['text'].strip().replace('"', '') samples.append((anno['img'], text)) diff --git a/mmocr/datasets/preparers/parsers/loaders.py b/mmocr/datasets/preparers/parsers/loaders.py deleted file mode 100644 index 90145b6b5..000000000 --- a/mmocr/datasets/preparers/parsers/loaders.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import re -from typing import Dict, Tuple, Union - -import yaml - - -def txt_loader(file_path: str, - separator: str = ',', - format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', - encoding='utf-8') -> Union[Dict, str]: - """Loading txt format annotations. - - Args: - file_path (str): Path to the txt file. - separator (str, optional): Separator of data. Defaults to ','. - format (str, optional): Annotation format. - Defaults to 'x1,y1,x2,y2,x3,y3,x4,y4,trans'. - encoding (str, optional): Encoding format. Defaults to 'utf-8'. - - Yields: - Iterator[Union[Dict, str]]: Original text line or a dict containing - the information of the text line. - """ - keys = format.split(separator) - with open(file_path, 'r', encoding=encoding) as f: - for line in f.readlines(): - line = line.strip() - if line: - yield dict(zip(keys, line.split(separator))) - - -def totaltext_loader(file_path: str) -> str: - """The annotation of the totaltext dataset may be stored in multiple lines, - this loader is designed for this special case. - - Args: - file_path (str): Path to the txt file - - Yield: - str: Complete annotation of the txt file - """ - - def parsing_line(line: str) -> Tuple: - """Parsing a line of the annotation. - - Args: - line (str): A line of the annotation. - - Returns: - Tuple: A tuple of (polygon, transcription). 
- """ - line = '{' + line.replace('[[', '[').replace(']]', ']') + '}' - ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line) - ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict) - ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict) - ann_dict = ann_dict.replace("[u',']", "[u'#']") - ann_dict = yaml.safe_load(ann_dict) - - # polygon - xs, ys = ann_dict['x'], ann_dict['y'] - poly = [] - for x, y in zip(xs, ys): - poly.append(x) - poly.append(y) - # text - text = ann_dict['transcriptions'] - if len(text) == 0: - text = '#' - else: - word = text[0] - if len(text) > 1: - for ann_word in text[1:]: - word += ',' + ann_word - text = str(eval(word)) - - return poly, text - - with open(file_path, 'r') as f: - for idx, line in enumerate(f): - line = line.strip() - if idx == 0: - tmp_line = line - continue - if not line.startswith('x:'): - tmp_line += ' ' + line - continue - complete_line = tmp_line - tmp_line = line - yield parsing_line(complete_line) - - if tmp_line != '': - yield parsing_line(tmp_line) diff --git a/mmocr/datasets/preparers/parsers/totaltext_parser.py b/mmocr/datasets/preparers/parsers/totaltext_parser.py index 199d5258d..1a7d65c5f 100644 --- a/mmocr/datasets/preparers/parsers/totaltext_parser.py +++ b/mmocr/datasets/preparers/parsers/totaltext_parser.py @@ -1,19 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. +import re from typing import Dict, Tuple -from ..data_preparer import DATA_PARSER +import yaml + +from ..data_preparer import DATA_PARSERS from .base import BaseParser -from .loaders import totaltext_loader -@DATA_PARSER.register_module() -class TotaltextTextDetParser(BaseParser): +@DATA_PARSERS.register_module() +class TotaltextTextDetAnnParser(BaseParser): """TotalText Text Detection Parser. The original annotation format of this dataset is stored in txt files, which is formed as the following format: x: [[x1 x2 x3 ... xn]], y: [[y1 y2 y3 ... 
yn]], ornt: [u'c'], transcriptions: [u'transcription'] + + Args: + data_root (str): Path to the dataset root. + ignore (str): The text of the ignored instances. Default: '#'. + nproc (int): Number of processes to load the data. Default: 1. """ def __init__(self, @@ -27,8 +34,70 @@ def parse_file(self, file: Tuple, split: str) -> Dict: """Convert single annotation.""" img_file, txt_file = file instances = list() - for poly, text in totaltext_loader(txt_file): + for poly, text in self.loader(txt_file): instances.append( dict(poly=poly, text=text, ignore=text == self.ignore)) return img_file, instances + + def loader(self, file_path: str) -> str: + """The annotation of the totaltext dataset may be stored in multiple + lines, this loader is designed for this special case. + + Args: + file_path (str): Path to the txt file + + Yield: + str: Complete annotation of the txt file + """ + + def parsing_line(line: str) -> Tuple: + """Parsing a line of the annotation. + + Args: + line (str): A line of the annotation. + + Returns: + Tuple: A tuple of (polygon, transcription). 
+ """ + line = '{' + line.replace('[[', '[').replace(']]', ']') + '}' + ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line) + ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict) + ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict) + ann_dict = ann_dict.replace("[u',']", "[u'#']") + ann_dict = yaml.safe_load(ann_dict) + + # polygon + xs, ys = ann_dict['x'], ann_dict['y'] + poly = [] + for x, y in zip(xs, ys): + poly.append(x) + poly.append(y) + # text + text = ann_dict['transcriptions'] + if len(text) == 0: + text = '#' + else: + word = text[0] + if len(text) > 1: + for ann_word in text[1:]: + word += ',' + ann_word + text = str(eval(word)) + + return poly, text + + with open(file_path, 'r') as f: + for idx, line in enumerate(f): + line = line.strip() + if idx == 0: + tmp_line = line + continue + if not line.startswith('x:'): + tmp_line += ' ' + line + continue + complete_line = tmp_line + tmp_line = line + yield parsing_line(complete_line) + + if tmp_line != '': + yield parsing_line(tmp_line) diff --git a/mmocr/datasets/preparers/parsers/wildreceipt.py b/mmocr/datasets/preparers/parsers/wildreceipt.py index 180a1e78f..b1a952367 100644 --- a/mmocr/datasets/preparers/parsers/wildreceipt.py +++ b/mmocr/datasets/preparers/parsers/wildreceipt.py @@ -4,12 +4,12 @@ from typing import Dict, Tuple from mmocr.utils import list_from_file -from ..data_preparer import DATA_PARSER +from ..data_preparer import DATA_PARSERS from .base import BaseParser -@DATA_PARSER.register_module() -class WildreceiptTextDetParser(BaseParser): +@DATA_PARSERS.register_module() +class WildreceiptTextDetAnnParser(BaseParser): """Wildreceipt Text Detection Parser. The original annotation format of this dataset is stored in txt files, @@ -22,6 +22,12 @@ class WildreceiptTextDetParser(BaseParser): "text": "xxx", "label": 25, ]} + + Args: + data_root (str): The root path of the dataset. + ignore (int): The label to be ignored. Defaults to 0. 
+ nproc (int): The number of processes to parse the annotation. Defaults + to 1. """ def __init__(self, @@ -50,8 +56,8 @@ def parse_files(self, files: Tuple, split: str) -> Dict: return samples -@DATA_PARSER.register_module() -class WildreceiptKIEParser(BaseParser): +@DATA_PARSERS.register_module() +class WildreceiptKIEAnnParser(BaseParser): """Wildreceipt KIE Parser. The original annotation format of this dataset is stored in txt files, @@ -64,6 +70,12 @@ class WildreceiptKIEParser(BaseParser): "text": "xxx", "label": 25, ]} + + Args: + data_root (str): The root path of the dataset. + ignore (int): The label to be ignored. Defaults to 0. + nproc (int): The number of processes to parse the annotation. Defaults + to 1. """ def __init__(self, diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index 575194aee..2e0173121 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -6,7 +6,7 @@ is_type_list, valid_boundary) from .collect_env import collect_env from .data_converter_utils import dump_ocr_data, recog_anno_to_imginfo -from .fileio import (check_integrity, iszip, list_from_file, list_to_file, +from .fileio import (check_integrity, is_archive, list_from_file, list_to_file, retrieve_files) from .img_utils import crop_img, warp_img from .mask_utils import fill_hole @@ -41,6 +41,6 @@ 'ConfigType', 'DetSampleList', 'RecForwardResults', 'InitConfigType', 'OptConfigType', 'OptDetSampleList', 'OptInitConfigType', 'OptMultiConfig', 'OptRecSampleList', 'RecSampleList', 'MultiConfig', 'OptTensor', - 'ColorType', 'OptKIESampleList', 'KIESampleList', 'iszip', + 'ColorType', 'OptKIESampleList', 'KIESampleList', 'is_archive', 'check_integrity', 'retrieve_files' ] diff --git a/mmocr/utils/fileio.py b/mmocr/utils/fileio.py index 05020ec3f..efd55680e 100644 --- a/mmocr/utils/fileio.py +++ b/mmocr/utils/fileio.py @@ -42,14 +42,14 @@ def list_from_file(filename, encoding='utf-8'): return item_list -def iszip(file_path: str) -> bool: - """Check whether the 
file is a supported zip format. +def is_archive(file_path: str) -> bool: + """Check whether the file is a supported archive format. Args: file_path (str): Path to the file. Returns: - bool: Whether the file is a zip. + bool: Whether the file is an archive. """ suffixes = ['zip', 'tar', 'tar.gz'] From bd5c8250b6d1ca0e7920bc22d3a22f90517cd60a Mon Sep 17 00:00:00 2001 From: xinyu Date: Thu, 27 Oct 2022 10:29:01 +0800 Subject: [PATCH 04/20] fix comments --- dataset_zoo/icdar2015/textdet.py | 4 ++-- dataset_zoo/icdar2015/textrecog.py | 4 ++-- dataset_zoo/totaltext/textdet.py | 4 ++-- dataset_zoo/wildreceipt/kie.py | 4 ++-- docs/zh_cn/get_started/install.md | 2 +- mmocr/datasets/preparers/data_preparer.py | 14 +++++++++----- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py index 71c00c798..1d78936b7 100644 --- a/dataset_zoo/icdar2015/textdet.py +++ b/dataset_zoo/icdar2015/textdet.py @@ -1,5 +1,5 @@ -data_root = './data/icdar2015' -cache_path = './data/cache' +data_root = 'data/icdar2015' +cache_path = 'data/cache' data_obtainer = dict( type='NaiveDataObtainer', diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py index 83ed40e55..30335ee10 100644 --- a/dataset_zoo/icdar2015/textrecog.py +++ b/dataset_zoo/icdar2015/textrecog.py @@ -1,5 +1,5 @@ -data_root = './data/icdar2015' -cache_path = './data/cache' +data_root = 'data/icdar2015' +cache_path = 'data/cache' data_obtainer = dict( type='NaiveDataObtainer', diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py index dad673128..0471e4f3b 100644 --- a/dataset_zoo/totaltext/textdet.py +++ b/dataset_zoo/totaltext/textdet.py @@ -1,5 +1,5 @@ -data_root = './data/totaltext' -cache_path = './data/cache' +data_root = 'data/totaltext' +cache_path = 'data/cache' data_obtainer = dict( type='NaiveDataObtainer', diff --git a/dataset_zoo/wildreceipt/kie.py b/dataset_zoo/wildreceipt/kie.py index 
32685411b..66ab937dd 100644 --- a/dataset_zoo/wildreceipt/kie.py +++ b/dataset_zoo/wildreceipt/kie.py @@ -1,5 +1,5 @@ -data_root = './data/wildreceipt' -cache_path = './data/cache' +data_root = 'data/wildreceipt' +cache_path = 'data/cache' data_obtainer = dict( type='NaiveDataObtainer', diff --git a/docs/zh_cn/get_started/install.md b/docs/zh_cn/get_started/install.md index ae9c4c42a..1ee9ac810 100644 --- a/docs/zh_cn/get_started/install.md +++ b/docs/zh_cn/get_started/install.md @@ -118,7 +118,7 @@ python mmocr/ocr.py --det DB_r18 --recog CRNN demo/demo_text_ocr.jpg --show 也可以在 Python 解释器中运行以下代码: ```python -from mmocr.utils.ocr import MMOCR +from mmocr.ocr import MMOCR ocr = MMOCR(recog='CRNN', det='DB_r18') ocr.readtext('demo_text_ocr.jpg', show=True) ``` diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 84f5390ee..9f9f81857 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -17,11 +17,12 @@ class DatasetPreparer: Dataset preparer is used to prepare dataset for MMOCR. It mainly consists of two steps: - 1. Obtaining the dataset + 1. Obtain the dataset - Download - Extract - Move/Rename 2. Process the dataset + - Parse original annotations - Convert to mmocr format - Dump the annotation file - Clean useless files @@ -55,7 +56,7 @@ def __call__(self): """Prepare the dataset.""" if self.with_obtainer: self.data_obtainer() - if self.with_processor: + if self.with_converter: self.data_converter() def parse_meta(self, cfg_path: str) -> None: @@ -64,7 +65,10 @@ def parse_meta(self, cfg_path: str) -> None: Args: cfg_path (str): Path to meta file. """ - meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml')) + try: + meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml')) + except FileNotFoundError: + return assert self.task in meta['Data']['Tasks'], \ f'Task {self.task} not supported!' 
 # License related @@ -104,6 +108,6 @@ def with_obtainer(self) -> bool: return getattr(self, 'data_obtainer', None) is not None @property - def with_processor(self) -> bool: - """bool: whether the data preparer has an obtainer""" + def with_converter(self) -> bool: + """bool: whether the data preparer has a converter""" return getattr(self, 'data_converter', None) is not None From fb9eb270fd082519bd55918ee2c8db7d396548f5 Mon Sep 17 00:00:00 2001 From: xinyu Date: Thu, 27 Oct 2022 15:38:11 +0800 Subject: [PATCH 05/20] update doc; add script to generate dataset zoo doc --- dataset_zoo/icdar2015/metafile.yml | 3 + dataset_zoo/totaltext/metafile.yml | 2 + dataset_zoo/wildreceipt/metafile.yml | 7 +++ docs/en/conf.py | 1 + docs/en/dataset_zoo.py | 55 +++++++++++++++++++ docs/en/index.rst | 1 + .../data_prepare/dataset_preparer.md | 47 +++++++++++++--- docs/zh_cn/conf.py | 1 + docs/zh_cn/dataset_zoo.py | 54 ++++++++++++++++++ docs/zh_cn/index.rst | 1 + .../data_prepare/dataset_preparer.md | 49 ++++++++++++++--- 11 files changed, 206 insertions(+), 15 deletions(-) create mode 100755 docs/en/dataset_zoo.py create mode 100755 docs/zh_cn/dataset_zoo.py diff --git a/dataset_zoo/icdar2015/metafile.yml b/dataset_zoo/icdar2015/metafile.yml index c04728754..11f20a8ab 100644 --- a/dataset_zoo/icdar2015/metafile.yml +++ b/dataset_zoo/icdar2015/metafile.yml @@ -26,3 +26,6 @@ Data: License: Type: CC BY 4.0 Link: https://creativecommons.org/licenses/by/4.0/ + Format: + - 'x1,y1,x2,y2,x3,y3,x4,y4,trans' + - 'img_name,trans' diff --git a/dataset_zoo/totaltext/metafile.yml b/dataset_zoo/totaltext/metafile.yml index 69777ee00..427f1f435 100644 --- a/dataset_zoo/totaltext/metafile.yml +++ b/dataset_zoo/totaltext/metafile.yml @@ -27,3 +27,5 @@ Data: License: Type: BSD-3 Link: https://github.com/cs-chan/Total-Text-Dataset/blob/master/LICENSE + Format: + - "x: [[x1 x2 x3 ... xn]], y: [[y1 y2 y3 ... 
yn]], ornt: [u'c'], transcriptions: [u'transcription']" diff --git a/dataset_zoo/wildreceipt/metafile.yml b/dataset_zoo/wildreceipt/metafile.yml index efc7f44eb..f617ac671 100644 --- a/dataset_zoo/wildreceipt/metafile.yml +++ b/dataset_zoo/wildreceipt/metafile.yml @@ -27,3 +27,10 @@ Data: License: Type: N/A Link: N/A + Format: + - "{'file_name': 'xxx/xxx/xx/xxxx.jpeg', 'height': 1200, 'width': 1600, + 'annotations': [ + 'box': [x1, y1, x2, y2, x3, y3, x4, y4], + 'text': 'xxx', + 'label': 25, + ]}" diff --git a/docs/en/conf.py b/docs/en/conf.py index 255c071b4..6c62152b8 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -173,6 +173,7 @@ def builder_inited_handler(app): subprocess.run(['./merge_docs.sh']) subprocess.run(['./stats.py']) + subprocess.run(['./dataset_zoo.py']) def setup(app): diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py new file mode 100755 index 000000000..9b8e55053 --- /dev/null +++ b/docs/en/dataset_zoo.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +import os +import os.path as osp + +import yaml + +dataset_zoo_path = '../../dataset_zoo' +datasets = os.listdir(dataset_zoo_path) +datasets.sort() + +table = '# Dataset Zoo\n' +table += '## Supported Datasets\n' +table += '| Dataset Name | Text Detection | Text Recognition | Text Spotting | KIE |\n' \ + '|--------------|----------------|------------------|---------------|-----|\n' # noqa: E501 +details = '## Dataset Details\n' + +for dataset in datasets: + meta = yaml.safe_load( + open(osp.join(dataset_zoo_path, dataset, 'metafile.yml'))) + dataset_name = meta['Name'] + paper = meta['Paper'] + data = meta['Data'] + + table += '| [{}](#{}) | {} | {} | {} | {} |\n'.format( + dataset, + dataset_name.lower().replace(' ', '-'), + '✓' if 'textdet' in data['Tasks'] else '', + '✓' if 'textrecog' in data['Tasks'] else '', + '✓' if 'textspotting' in data['Tasks'] else '', + '✓' if 'kie' in data['Tasks'] else '', + ) + + details += '### {}\n'.format(dataset_name) + details += "> \"{}\", *{}*, 
{}.\n\n".format(paper['Title'], paper['Venue'], + paper['Year']) + + details += ' - Official Website: [{}]({})\n'.format( + dataset, data['Website']) + details += ' - Year: {}\n'.format(paper['Year']) + details += ' - Language: {}\n'.format(data['Language']) + details += ' - Scene: {}\n'.format(data['Scene']) + details += ' - Annotation Granularity: {}\n'.format(data['Granularity']) + details += ' - Supported Tasks: {}\n'.format(data['Tasks']) + details += ' - License: [{}]({})\n'.format(data['License']['Type'], + data['License']['Link']) + details += ' - Annotation Format:\n' + for format in data['Format']: + details += ' - {}\n'.format(format) + + details += '```bibtex\n{}\n```\n'.format(paper['BibTeX']) + +datasetzoo = table + details + +with open('user_guides/data_prepare/datasetzoo.md', 'w') as f: + f.write(datasetzoo) diff --git a/docs/en/index.rst b/docs/en/index.rst index 2d68ce3f8..6d96c028b 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -51,6 +51,7 @@ You can switch between English and Chinese in the lower-left corner of the layou :maxdepth: 2 :caption: Dataset Zoo + user_guides/data_prepare/datasetzoo.md user_guides/data_prepare/dataset_preparer.md user_guides/data_prepare/det.md user_guides/data_prepare/recog.md diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md index e08cd4ee9..bba04baf7 100644 --- a/docs/en/user_guides/data_prepare/dataset_preparer.md +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -28,13 +28,7 @@ Also, the script supports preparing multiple datasets at the same time. For exam python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog ``` -The following table shows the supported datasets. 
- -| Dataset Name | Text Detection | Text Recognition | Text Spotting | KIE | -| ------------ | -------------- | ---------------- | ------------- | --- | -| icdar2015 | ✓ | ✓ | ✓ | | -| totaltext | ✓ | ✓ | ✓ | | -| wildreceipt | ✓ | ✓ | ✓ | ✓ | +To check the supported datasets in MMOCR, please refer to [Dataset Zoo](./datasetzoo.md). ## Advanced Usage @@ -59,6 +53,45 @@ dataset_zoo/ `metafile.yml` is the metafile of the dataset, which contains the basic information of the dataset, including the year of publication, the author of the paper, and other information such as license. The other files named by the task are the configuration files of the dataset preparer, which are used to configure the download, decompression, format conversion, etc. of the dataset. These configs are in Python format, and their usage is completely consistent with the configuration files in MMOCR repo. See [Configuration File Documentation](../config.md) for detailed usage. +#### Metafile + +Take the ICDAR2015 dataset as an example, the `metafile.yml` stores the basic information of the dataset: + +```yaml +Name: 'Incidental Scene Text IC15' +Paper: + Title: ICDAR 2015 Competition on Robust Reading + URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf + Venue: ICDAR + Year: '2015' + BibTeX: '@inproceedings{karatzas2015icdar, + title={ICDAR 2015 competition on robust reading}, + author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others}, + booktitle={2015 13th international conference on document analysis and recognition (ICDAR)}, + pages={1156--1160}, + year={2015}, + organization={IEEE}}' +Data: + Website: https://rrc.cvc.uab.es/?ch=4 + Language: + - English + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: 
https://creativecommons.org/licenses/by/4.0/ +``` + +It is not mandatory to use the metafile in the dataset preparation process (so users can ignore this file when prepare private datasets), but in order to better understand the information of each public dataset, we recommend that users read the metafile before before preparing the dataset, which will help to understand whether the datasets meet their needs. + +#### Config of Dataset Preparer + Next, we will introduce the conventional fields and usage of the dataset preparer configuration files. In the configuration files, there are two fields `data_root` and `cache_path`, which are used to store the converted dataset and the temporary files such as the archived files downloaded during the data preparation process. diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index c6363abb6..c9559ceb8 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -171,6 +171,7 @@ def builder_inited_handler(app): subprocess.run(['./cp_origin_docs.sh']) subprocess.run(['./merge_docs.sh']) subprocess.run(['./stats.py']) + subprocess.run(['./dataset_zoo.py']) def setup(app): diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py new file mode 100755 index 000000000..b577b3948 --- /dev/null +++ b/docs/zh_cn/dataset_zoo.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +import os +import os.path as osp + +import yaml + +dataset_zoo_path = '../../dataset_zoo' +datasets = os.listdir(dataset_zoo_path) +datasets.sort() + +table = '# 支持数据集一览\n' +table += '## 支持的数据集\n' +table += '| 数据集名称 | 文本检测 | 文本识别 | 端到端文本检测识别 | 关键信息抽取 |\n' \ + '|----------|---------|--------|------------------|-----------|\n' +details = '## 数据集详情\n' + +for dataset in datasets: + meta = yaml.safe_load( + open(osp.join(dataset_zoo_path, dataset, 'metafile.yml'))) + dataset_name = meta['Name'] + paper = meta['Paper'] + data = meta['Data'] + + table += '| [{}](#{}) | {} | {} | {} | {} |\n'.format( + dataset, + dataset_name.lower().replace(' ', '-'), + '✓' if 
'textdet' in data['Tasks'] else '', + '✓' if 'textrecog' in data['Tasks'] else '', + '✓' if 'textspotting' in data['Tasks'] else '', + '✓' if 'kie' in data['Tasks'] else '', + ) + + details += '### {}\n'.format(dataset_name) + details += "> \"{}\", *{}*, {}.\n\n".format(paper['Title'], paper['Venue'], + paper['Year']) + + details += ' - 官方网址: [{}]({})\n'.format(dataset, data['Website']) + details += ' - 发布年份: {}\n'.format(paper['Year']) + details += ' - 语言: {}\n'.format(data['Language']) + details += ' - 场景: {}\n'.format(data['Scene']) + details += ' - 标注粒度: {}\n'.format(data['Granularity']) + details += ' - 支持任务: {}\n'.format(data['Tasks']) + details += ' - 数据集许可证: [{}]({})\n'.format(data['License']['Type'], + data['License']['Link']) + details += ' - 标注格式:\n' + for format in data['Format']: + details += ' - {}\n'.format(format) + + details += '```bibtex\n{}\n```\n'.format(paper['BibTeX']) + +datasetzoo = table + details + +with open('user_guides/data_prepare/datasetzoo.md', 'w') as f: + f.write(datasetzoo) diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index a2761dbdd..01e6865fd 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -51,6 +51,7 @@ :maxdepth: 2 :caption: 数据集支持 + user_guides/data_prepare/datasetzoo.md user_guides/data_prepare/dataset_preparer.md user_guides/data_prepare/det.md user_guides/data_prepare/recog.md diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md index 014f1b572..bc03b49b2 100644 --- a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md +++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md @@ -28,13 +28,7 @@ python tools/dataset_converters/prepare_dataset.py icdar2015 --task textdet python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog ``` -下表展示了目前支持一键下载及格式转换的数据集。 - -| 数据集名称 | 文本检测任务 | 文本识别任务 | 端到端文本检测识别任务 | 关键信息抽取任务 | -| ----------- | ------------ | ------------ | ---------------------- | 
---------------- | -| icdar2015 | ✓ | ✓ | ✓ | | -| totaltext | ✓ | ✓ | ✓ | | -| wildreceipt | ✓ | ✓ | ✓ | ✓ | +进一步了解 MMOCR 支持的数据集,您可以浏览[支持的数据集文档](./datasetzoo.md) ## 进阶用法 @@ -59,7 +53,46 @@ dataset_zoo/ 其中,`metafile.yml` 是数据集的元信息文件,其中存放了对应数据集的基本信息,包括发布年份,论文作者,以及版权等其他信息。其它以任务名命名的则是数据集准备脚本的配置文件,用于配置数据集的下载、解压、格式转换等操作。这些配置文件采用了 Python 格式,其使用方法与 MMOCR 算法库的其他配置文件完全一致,详见[配置文件文档](../config.md)。 -下面,我们将介绍数据集准备脚本配置文件的默认字段与使用方法。 +#### 数据集元文件 + +以数据集 ICDAR2015 为例,`metafile.yml` 中存储了基础的数据集信息: + +```yaml +Name: 'Incidental Scene Text IC15' +Paper: + Title: ICDAR 2015 Competition on Robust Reading + URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf + Venue: ICDAR + Year: '2015' + BibTeX: '@inproceedings{karatzas2015icdar, + title={ICDAR 2015 competition on robust reading}, + author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others}, + booktitle={2015 13th international conference on document analysis and recognition (ICDAR)}, + pages={1156--1160}, + year={2015}, + organization={IEEE}}' +Data: + Website: https://rrc.cvc.uab.es/?ch=4 + Language: + - English + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ +``` + +该文件在数据集准备过程中并不是强制要求的(因此用户在使用添加自己的私有数据集时可以忽略该文件),但为了用户更好地了解各个公开数据集的信息,我们建议用户在使用数据集准备脚本前阅读对应的元文件信息,以了解该数据集的特征是否符合用户需求。 + +#### 数据集准备脚本配置文件 + +下面,我们将介绍数据集准备脚本配置文件 `textXXX.py` 的默认字段与使用方法。 我们在配置文件中提供了 `data_root` 与 `cache_path` 两个默认字段,分别用于存放转换后的 MMOCR 格式的数据集文件,以及在数据准备过程中下载的压缩包等临时文件。 From 11f113eeafdbda03180907fc9a469bfad6f698ee Mon Sep 17 00:00:00 2001 From: xinyu Date: Thu, 27 Oct 2022 19:16:13 +0800 Subject: [PATCH 06/20] fix comments; update scripts --- dataset_zoo/icdar2015/metafile.yml | 4 +- dataset_zoo/icdar2015/sample_anno.md | 19 ++++++++ 
dataset_zoo/totaltext/metafile.yml | 3 +- dataset_zoo/totaltext/sample_anno.md | 6 +++ dataset_zoo/wildreceipt/metafile.yml | 8 +--- dataset_zoo/wildreceipt/sample_anno.md | 45 +++++++++++++++++++ docs/en/dataset_zoo.py | 19 ++++++-- .../data_prepare/dataset_preparer.md | 2 +- docs/zh_cn/dataset_zoo.py | 22 ++++++--- mmocr/datasets/preparers/data_converter.py | 25 +++++------ mmocr/datasets/preparers/data_obtainer.py | 3 ++ mmocr/datasets/preparers/data_preparer.py | 2 + 12 files changed, 121 insertions(+), 37 deletions(-) create mode 100644 dataset_zoo/icdar2015/sample_anno.md create mode 100644 dataset_zoo/totaltext/sample_anno.md create mode 100644 dataset_zoo/wildreceipt/sample_anno.md diff --git a/dataset_zoo/icdar2015/metafile.yml b/dataset_zoo/icdar2015/metafile.yml index 11f20a8ab..fa4c24dbc 100644 --- a/dataset_zoo/icdar2015/metafile.yml +++ b/dataset_zoo/icdar2015/metafile.yml @@ -26,6 +26,4 @@ Data: License: Type: CC BY 4.0 Link: https://creativecommons.org/licenses/by/4.0/ - Format: - - 'x1,y1,x2,y2,x3,y3,x4,y4,trans' - - 'img_name,trans' + Format: .txt diff --git a/dataset_zoo/icdar2015/sample_anno.md b/dataset_zoo/icdar2015/sample_anno.md new file mode 100644 index 000000000..73b37c3eb --- /dev/null +++ b/dataset_zoo/icdar2015/sample_anno.md @@ -0,0 +1,19 @@ +**Text Detection** + +```text +# x1,y1,x2,y2,x3,y3,x4,y4,trans + +377,117,463,117,465,130,378,130,Genaxis Theatre +493,115,519,115,519,131,493,131,[06] +374,155,409,155,409,170,374,170,### +``` + +**Text Recognition** + +```text +# img_name, "text" + +word_1.png, "Genaxis Theatre" +word_2.png, "[06]" +word_3.png, "62-03" +``` diff --git a/dataset_zoo/totaltext/metafile.yml b/dataset_zoo/totaltext/metafile.yml index 427f1f435..ea94f1602 100644 --- a/dataset_zoo/totaltext/metafile.yml +++ b/dataset_zoo/totaltext/metafile.yml @@ -27,5 +27,4 @@ Data: License: Type: BSD-3 Link: https://github.com/cs-chan/Total-Text-Dataset/blob/master/LICENSE - Format: - - "x: [[x1 x2 x3 ... xn]], y: [[y1 y2 y3 ... 
yn]], ornt: [u'c'], transcriptions: [u'transcription']" + Format: .txt diff --git a/dataset_zoo/totaltext/sample_anno.md b/dataset_zoo/totaltext/sample_anno.md new file mode 100644 index 000000000..51ce61a2b --- /dev/null +++ b/dataset_zoo/totaltext/sample_anno.md @@ -0,0 +1,6 @@ +**Text Detection/Spotting** + +```text +x: [[259 313 389 427 354 302]], y: [[542 462 417 459 507 582]], ornt: [u'c'], transcriptions: [u'PAUL'] +x: [[400 478 494 436]], y: [[398 380 448 465]], ornt: [u'#'], transcriptions: [u'#'] +``` diff --git a/dataset_zoo/wildreceipt/metafile.yml b/dataset_zoo/wildreceipt/metafile.yml index f617ac671..29c6f600c 100644 --- a/dataset_zoo/wildreceipt/metafile.yml +++ b/dataset_zoo/wildreceipt/metafile.yml @@ -27,10 +27,4 @@ Data: License: Type: N/A Link: N/A - Format: - - "{'file_name': 'xxx/xxx/xx/xxxx.jpeg', 'height': 1200, 'width': 1600, - 'annotations': [ - 'box': [x1, y1, x2, y2, x3, y3, x4, y4], - 'text': 'xxx', - 'label': 25, - ]}" + Format: .txt diff --git a/dataset_zoo/wildreceipt/sample_anno.md b/dataset_zoo/wildreceipt/sample_anno.md new file mode 100644 index 000000000..e5a0dcea2 --- /dev/null +++ b/dataset_zoo/wildreceipt/sample_anno.md @@ -0,0 +1,45 @@ +**KIE** + +```json +// Close Set +{ + "file_name": "image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg", + "height": 1200, + "width": 1600, + "annotations": + [ + { + "box": [550.0, 190.0, 937.0, 190.0, 937.0, 104.0, 550.0, 104.0], + "text": "SAFEWAY", + "label": 1 + }, + { + "box": [1048.0, 211.0, 1074.0, 211.0, 1074.0, 196.0, 1048.0, 196.0], + "text": "TM", + "label": 25 + } + ], //... 
+} + +// Open Set +{ + "file_name": "image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg", + "height": 348, + "width": 348, + "annotations": + [ + { + "box": [114.0, 19.0, 230.0, 19.0, 230.0, 1.0, 114.0, 1.0], + "text": "CHOEUN", + "label": 2, + "edge": 1 + }, + { + "box": [97.0, 35.0, 236.0, 35.0, 236.0, 19.0, 97.0, 19.0], + "text": "KOREANRESTAURANT", + "label": 2, + "edge": 1 + } + ] +} +``` diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py index 9b8e55053..01e47c2d5 100755 --- a/docs/en/dataset_zoo.py +++ b/docs/en/dataset_zoo.py @@ -8,7 +8,7 @@ datasets = os.listdir(dataset_zoo_path) datasets.sort() -table = '# Dataset Zoo\n' +table = '# Overview\n' table += '## Supported Datasets\n' table += '| Dataset Name | Text Detection | Text Recognition | Text Spotting | KIE |\n' \ '|--------------|----------------|------------------|---------------|-----|\n' # noqa: E501 @@ -34,6 +34,8 @@ details += "> \"{}\", *{}*, {}.\n\n".format(paper['Title'], paper['Venue'], paper['Year']) + # Basic Info + details += 'A. Basic Info\n' details += ' - Official Website: [{}]({})\n'.format( dataset, data['Website']) details += ' - Year: {}\n'.format(paper['Year']) @@ -43,10 +45,19 @@ details += ' - Supported Tasks: {}\n'.format(data['Tasks']) details += ' - License: [{}]({})\n'.format(data['License']['Type'], data['License']['Link']) - details += ' - Annotation Format:\n' - for format in data['Format']: - details += ' - {}\n'.format(format) + # Format + details += '
B. Annotation Format\n\n
' + sample_path = osp.join(dataset_zoo_path, dataset, 'sample_anno.md') + if osp.exists(sample_path): + with open(sample_path, 'r') as f: + samples = f.readlines() + samples = ''.join(samples) + details += samples + details += '
\n\n
' + + # Reference + details += 'C. Reference\n' details += '```bibtex\n{}\n```\n'.format(paper['BibTeX']) datasetzoo = table + details diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md index bba04baf7..08ca654f1 100644 --- a/docs/en/user_guides/data_prepare/dataset_preparer.md +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -175,7 +175,7 @@ one-to-many Therefore, we provide two built-in gatherers, `pair_gather` and `mono_gather`, to handle the two cases. `pair_gather` is used for the case of many-to-many, and `mono_gather` is used for the case of one-to-many. `pair_gather` needs to specify the `suffixes` parameter to indicate the suffix of the image, such as `suffixes=[.jpg,.JPG]` in the above example. In addition, we need to specify the corresponding relationship between the image and the annotation file through the regular expression, such as `rule=[r'img_(\d+)\.([jJ][pP][gG])',r'gt_img_\1.txt']` in the above example. Where `\d+` is used to match the serial number of the image, `([jJ][pP][gG])` is used to match the suffix of the image, and `\_1` matches the serial number of the image and the serial number of the annotation file. -When the image and annotation file are matched, the the original annotations will be parsed. Since the annotation format is usually varied from datasets to datasets, the parsers are usually dataset related. Then, the parser will pack the required data into the MMOCR format. +When the image and annotation file are matched, the original annotations will be parsed. Since the annotation format is usually varied from dataset to dataset, the parsers are usually dataset related. Then, the parser will pack the required data into the MMOCR format. Finally, we can specify the dumpers to decide the data format. 
Currently, we only support `JsonDumper` and `WildreceiptOpensetDumper`, where the former is used to save the data in the standard MMOCR Json format, and the latter is used to save the data in the Wildreceipt format. In the future, we plan to support `LMDBDumper` to save the annotation files in LMDB format. diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py index b577b3948..972ea9311 100755 --- a/docs/zh_cn/dataset_zoo.py +++ b/docs/zh_cn/dataset_zoo.py @@ -33,19 +33,29 @@ details += '### {}\n'.format(dataset_name) details += "> \"{}\", *{}*, {}.\n\n".format(paper['Title'], paper['Venue'], paper['Year']) - + # Basic Info + details += 'A. 数据集基础信息\n' details += ' - 官方网址: [{}]({})\n'.format(dataset, data['Website']) details += ' - 发布年份: {}\n'.format(paper['Year']) details += ' - 语言: {}\n'.format(data['Language']) details += ' - 场景: {}\n'.format(data['Scene']) details += ' - 标注粒度: {}\n'.format(data['Granularity']) details += ' - 支持任务: {}\n'.format(data['Tasks']) - details += ' - 数据集许可证: [{}]({})\n'.format(data['License']['Type'], - data['License']['Link']) - details += ' - 标注格式:\n' - for format in data['Format']: - details += ' - {}\n'.format(format) + details += ' - 数据集许可证: [{}]({})\n\n'.format(data['License']['Type'], + data['License']['Link']) + + # Format + details += '
B. 标注格式\n\n
' + sample_path = osp.join(dataset_zoo_path, dataset, 'sample_anno.md') + if osp.exists(sample_path): + with open(sample_path, 'r') as f: + samples = f.readlines() + samples = ''.join(samples) + details += samples + details += '
\n\n
' + # Reference + details += 'C. 参考文献\n' details += '```bibtex\n{}\n```\n'.format(paper['BibTeX']) datasetzoo = table + details diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index a1a2e883f..ffa680383 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -211,9 +211,9 @@ def pack_instance(self, - img_path (str): Path to the image file. - instances (Sequence[Dict]): A list of converted annos. Each element should be a dict with the following keys: - - 'poly' or 'box' - - 'ignore' - - 'bbox_label' (optional) + - 'poly' or 'box' + - 'ignore' + - 'bbox_label' (optional) split (str): The split of the instance. Returns: @@ -273,7 +273,7 @@ class TextSpottingDataConverter(BaseDataConverter): dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations]. + conversion. Defaults to ['annotations']. """ def __init__(self, @@ -305,10 +305,10 @@ def pack_instance(self, - img_path (str): Path to image file. - instances (Sequence[Dict]): A list of converted annos. Each element should be a dict with the following keys: - - 'poly' or 'box' - - 'text' - - 'ignore' - - 'bbox_label' (optional) + - 'poly' or 'box' + - 'text' + - 'ignore' + - 'bbox_label' (optional) split (str): The split of the instance. Returns: @@ -399,14 +399,11 @@ def pack_instance(self, sample: Tuple, split: str) -> Dict: Dict: The packed instance. 
""" - def pack(img_name: str, text: str, split: str) -> Dict: - return dict( - instances=[dict(text=text)], - img_path=osp.join(split, img_name)) - img_name, text = sample + packed_instance = dict( + instances=[dict(text=text)], img_path=osp.join(split, img_name)) - return pack(img_name, text, split) + return packed_instance def add_meta(self, sample: Dict) -> Dict: meta = { diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py index cfa9a2921..4349498bc 100644 --- a/mmocr/datasets/preparers/data_obtainer.py +++ b/mmocr/datasets/preparers/data_obtainer.py @@ -31,6 +31,7 @@ def __init__(self, files: List[Dict], cache_path: str, self.files = files self.cache_path = cache_path self.data_root = data_root + mkdir_or_exist(self.data_root) mkdir_or_exist(osp.join(self.data_root, 'imgs')) mkdir_or_exist(osp.join(self.data_root, 'annotations')) mkdir_or_exist(self.cache_path) @@ -78,6 +79,8 @@ def progress(down: float, block: float, size: float) -> None: ' Please manually download the required files' ' following the guides.') + print(f'Start to download {osp.basename(dst_path)}...') + print('If you stuck here for a long time, please check your network.') request.urlretrieve(url, dst_path, progress) def extract(self, diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 9f9f81857..bbc46a028 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -55,8 +55,10 @@ def __init__(self, def __call__(self): """Prepare the dataset.""" if self.with_obtainer: + print('Obtain Dataset...') self.data_obtainer() if self.with_converter: + print('Convert Dataset...') self.data_converter() def parse_meta(self, cfg_path: str) -> None: From 7e898f8d09e2a7ccb25c1ffa0df26111ca0a9c6a Mon Sep 17 00:00:00 2001 From: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> Date: Fri, 28 Oct 2022 10:33:05 +0800 Subject: [PATCH 07/20] apply comments 
Co-authored-by: Tong Gao --- docs/en/user_guides/data_prepare/dataset_preparer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md index 08ca654f1..6e92c73c1 100644 --- a/docs/en/user_guides/data_prepare/dataset_preparer.md +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -88,7 +88,7 @@ Data: Link: https://creativecommons.org/licenses/by/4.0/ ``` -It is not mandatory to use the metafile in the dataset preparation process (so users can ignore this file when prepare private datasets), but in order to better understand the information of each public dataset, we recommend that users read the metafile before before preparing the dataset, which will help to understand whether the datasets meet their needs. +It is not mandatory to use the metafile in the dataset preparation process (so users can ignore this file when preparing private datasets), but in order to better understand the information of each public dataset, we recommend that users read the metafile before preparing the dataset, which will help to understand whether the datasets meet their needs. #### Config of Dataset Preparer From d9d748cfd12acb8cec6d9487c049a171f5fdb432 Mon Sep 17 00:00:00 2001 From: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> Date: Fri, 28 Oct 2022 10:33:17 +0800 Subject: [PATCH 08/20] apply comments Co-authored-by: Tong Gao --- tools/dataset_converters/prepare_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py index d1403b342..84e813b95 100644 --- a/tools/dataset_converters/prepare_dataset.py +++ b/tools/dataset_converters/prepare_dataset.py @@ -20,7 +20,7 @@ def parse_args(): '--task', default='textdet', choices=['textdet', 'textrecog', 'textspotting', 'kie'], - help='Task type. Options are det and rec.') + help='Task type. 
Options are "textdet", "textrecog", "textspotting" and "kie".') parser.add_argument( '--dataset-zoo-path', default='./dataset_zoo', From 35d1612c568559ac164d2b8fc4746ea4e4897b32 Mon Sep 17 00:00:00 2001 From: liukuikun <641417025@qq.com> Date: Fri, 28 Oct 2022 17:43:54 +0800 Subject: [PATCH 09/20] coco parser --- mmocr/datasets/preparers/parsers/__init__.py | 4 +- .../datasets/preparers/parsers/coco_parser.py | 45 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 mmocr/datasets/preparers/parsers/coco_parser.py diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 1ee50a34c..832013501 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .coco_parser import COCOTextDetAnnParser from .ic15_parser import ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt import WildreceiptKIEAnnParser __all__ = [ 'ICDAR2015TextDetAnnParser', 'ICDAR2015TextRecogAnnParser', - 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser' + 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', + 'COCOTextDetAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/coco_parser.py b/mmocr/datasets/preparers/parsers/coco_parser.py new file mode 100644 index 000000000..82a02afca --- /dev/null +++ b/mmocr/datasets/preparers/parsers/coco_parser.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +from mmdet.datasets.api_wrappers import COCO + +from mmocr.datasets.preparers.data_preparer import DATA_PARSERS +from mmocr.datasets.preparers.parsers.base import BaseParser + + +@DATA_PARSERS.register_module() +class COCOTextDetAnnParser(BaseParser): + """COCO Text Detection Parser. + + Args: + nproc (int): The number of processes to parse the annotation. 
Defaults + to 1. + """ + + def __init__(self, data_root=None, nproc: int = 1) -> None: + + super().__init__(nproc=nproc, data_root=data_root) + + def parse_files(self, files: Tuple, split: str = None) -> Dict: + """Parse single annotation.""" + samples = list() + coco = COCO(files) + img_ids = coco.get_img_ids() + + total_ann_ids = [] + for img_id in img_ids: + img_info = coco.load_imgs([img_id])[0] + img_info['img_id'] = img_id + img_path = img_info['file_name'] + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + ann_info = coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + instances = list() + for ann in ann_info: + instances.append( + dict( + poly=ann['segmentation'][0], + text=ann.get('text', None), + ignore=ann.get('iscrowd', False))) + samples.append((img_path, instances)) + return samples From 7cd1f9b99a4d14109dd142cd8f42b67e43ec6d67 Mon Sep 17 00:00:00 2001 From: Xinyu Date: Sat, 29 Oct 2022 15:18:47 +0800 Subject: [PATCH 10/20] fix comments --- configs/textdet/_base_/datasets/icdar2015.py | 2 -- configs/textdet/_base_/datasets/totaltext.py | 12 ++++----- .../textrecog/_base_/datasets/icdar2015.py | 2 -- .../textrecog/_base_/datasets/totaltext.py | 12 ++++----- dataset_zoo/icdar2015/textdet.py | 4 +-- dataset_zoo/icdar2015/textrecog.py | 4 +-- dataset_zoo/totaltext/textdet.py | 6 ++--- mmocr/datasets/preparers/data_converter.py | 27 +++++++++++-------- mmocr/datasets/preparers/data_obtainer.py | 13 +++++---- mmocr/datasets/preparers/data_preparer.py | 5 ++-- mmocr/datasets/preparers/parsers/base.py | 19 ++++++++++--- .../datasets/preparers/parsers/ic15_parser.py | 4 +-- mmocr/utils/__init__.py | 6 ++--- mmocr/utils/fileio.py | 6 ++++- tools/dataset_converters/prepare_dataset.py | 3 ++- 15 files changed, 72 insertions(+), 53 deletions(-) diff --git a/configs/textdet/_base_/datasets/icdar2015.py b/configs/textdet/_base_/datasets/icdar2015.py index a14cb383a..41893ce9e 100644 --- a/configs/textdet/_base_/datasets/icdar2015.py +++ 
b/configs/textdet/_base_/datasets/icdar2015.py @@ -4,7 +4,6 @@ type='OCRDataset', data_root=ic15_det_data_root, ann_file='textdet_train.json', - data_prefix=dict(img_path='imgs/'), filter_cfg=dict(filter_empty_gt=True, min_size=32), pipeline=None) @@ -12,6 +11,5 @@ type='OCRDataset', data_root=ic15_det_data_root, ann_file='textdet_test.json', - data_prefix=dict(img_path='imgs/'), test_mode=True, pipeline=None) diff --git a/configs/textdet/_base_/datasets/totaltext.py b/configs/textdet/_base_/datasets/totaltext.py index b29ec6709..4884d297e 100644 --- a/configs/textdet/_base_/datasets/totaltext.py +++ b/configs/textdet/_base_/datasets/totaltext.py @@ -1,17 +1,15 @@ -ic15_det_data_root = 'data/totaltext' +tt_det_data_root = 'data/totaltext' -ic15_det_train = dict( +tt_det_train = dict( type='OCRDataset', - data_root=ic15_det_data_root, + data_root=tt_det_data_root, ann_file='textdet_train.json', - data_prefix=dict(img_path='imgs/'), filter_cfg=dict(filter_empty_gt=True, min_size=32), pipeline=None) -ic15_det_test = dict( +tt_det_test = dict( type='OCRDataset', - data_root=ic15_det_data_root, + data_root=tt_det_data_root, ann_file='textdet_test.json', - data_prefix=dict(img_path='imgs/'), test_mode=True, pipeline=None) diff --git a/configs/textrecog/_base_/datasets/icdar2015.py b/configs/textrecog/_base_/datasets/icdar2015.py index bb15546c4..b0d542827 100644 --- a/configs/textrecog/_base_/datasets/icdar2015.py +++ b/configs/textrecog/_base_/datasets/icdar2015.py @@ -4,7 +4,6 @@ type='OCRDataset', data_root=ic15_rec_data_root, ann_file='textrecog_train.json', - data_prefix=dict(img_path='crops/'), test_mode=False, pipeline=None) @@ -12,6 +11,5 @@ type='OCRDataset', data_root=ic15_rec_data_root, ann_file='textrecog_test.json', - data_prefix=dict(img_path='crops/'), test_mode=True, pipeline=None) diff --git a/configs/textrecog/_base_/datasets/totaltext.py b/configs/textrecog/_base_/datasets/totaltext.py index 7eb7478a9..b1e10e751 100644 --- 
a/configs/textrecog/_base_/datasets/totaltext.py +++ b/configs/textrecog/_base_/datasets/totaltext.py @@ -1,17 +1,15 @@ -ic15_rec_data_root = 'data/totaltext/' +tt_rec_data_root = 'data/totaltext/' -ic15_rec_train = dict( +tt_rec_train = dict( type='OCRDataset', - data_root=ic15_rec_data_root, + data_root=tt_rec_data_root, ann_file='textrecog_train.json', - data_prefix=dict(img_path='crops/'), test_mode=False, pipeline=None) -ic15_rec_test = dict( +tt_rec_test = dict( type='OCRDataset', - data_root=ic15_rec_data_root, + data_root=tt_rec_data_root, ann_file='textrecog_test.json', - data_prefix=dict(img_path='crops/'), test_mode=True, pipeline=None) diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py index 1d78936b7..e3ae3b82a 100644 --- a/dataset_zoo/icdar2015/textdet.py +++ b/dataset_zoo/icdar2015/textdet.py @@ -12,14 +12,14 @@ md5='c51cbace155dcc4d98c8dd19d378f30d', split=['train'], content=['image'], - mapping=[['ic15_textdet_train_img', 'imgs/train']]), + mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]), dict( url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', save_name='ic15_textdet_test_img.zip', md5='97e4c1ddcf074ffcc75feff2b63c35dd', split=['test'], content=['image'], - mapping=[['ic15_textdet_test_img', 'imgs/test']]), + mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]), dict( url='https://rrc.cvc.uab.es/downloads/' 'ch4_training_localization_transcription_gt.zip', diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py index 30335ee10..c28b76b39 100644 --- a/dataset_zoo/icdar2015/textrecog.py +++ b/dataset_zoo/icdar2015/textrecog.py @@ -15,14 +15,14 @@ content=['image', 'annotation'], mapping=[[ 'ic15_textrecog_train_img_gt/gt.txt', 'annotations/train.txt' - ], ['ic15_textrecog_train_img_gt', 'crops/train']]), + ], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]), dict( url='https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip', 
save_name='ic15_textrecog_test_img.zip', md5='d7a71585f4cc69f89edbe534e7706d5d', split=['test'], content=['image'], - mapping=[['ic15_textrecog_test_img', 'crops/test']]), + mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]), dict( url='https://rrc.cvc.uab.es/downloads/' 'Challenge4_Test_Task3_GT.txt', diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py index 0471e4f3b..425909fac 100644 --- a/dataset_zoo/totaltext/textdet.py +++ b/dataset_zoo/totaltext/textdet.py @@ -13,13 +13,13 @@ md5='5b56d71a4005a333cf200ff35ce87f75', split=['train', 'test'], content=['image'], - mapping=[['totaltext/Images/Train', 'imgs/train'], - ['totaltext/Images/Test', 'imgs/test']]), + mapping=[['totaltext/Images/Train', 'textdet_imgs/train'], + ['totaltext/Images/Test', 'textdet_imgs/test']]), dict( url='https://universityofadelaide.box.com/shared/static/' '2vmpvjb48pcrszeegx2eznzc4izan4zf.zip', save_name='txt_format.zip', - md5='97e4c1ddcf074ffcc75feff2b63c35dd', + md5='53377a83420b4a0244304467512134e8', split=['train', 'test'], content=['annotation'], mapping=[['txt_format/Train', 'annotations/train'], diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index ffa680383..41eb4fa04 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -10,7 +10,7 @@ import mmcv from mmengine import mkdir_or_exist, track_parallel_progress -from mmocr.utils import bbox2poly, crop_img, poly2bbox, retrieve_files +from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS @@ -45,6 +45,7 @@ def __init__(self, self.nproc = nproc self.task = task self.delete = delete + self.img_dir = f'{task}_imgs' parser.update(dict(nproc=nproc)) dumper.update(dict(task=task)) self.parser = DATA_PARSERS.build(parser) @@ -64,7 +65,7 @@ def __call__(self): for split in self.splits: print(f'Parsing {split} 
split...') # Gather the info such as file names required by parser - img_path = osp.join(self.data_root, 'imgs', split) + img_path = osp.join(self.data_root, self.img_dir, split) ann_path = osp.join(self.data_root, 'annotations') gatherer_args = dict( img_path=img_path, ann_path=ann_path, split=split) @@ -152,10 +153,10 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence, List[Tuple]: A list of tuples (img_path, ann_path). """ files = list() - for file in retrieve_files(img_path, suffixes): + for file in list_files(img_path, suffixes): file2 = re.sub(rule[0], rule[1], osp.basename(file)) file2 = file.replace(osp.basename(file), file2) - file2 = file2.replace('imgs', 'annotations') + file2 = file2.replace(self.img_dir, 'annotations') files.append((file, file2)) return files @@ -293,6 +294,8 @@ def __init__(self, nproc=nproc, delete=delete, task='textspotting') + # Textspotting task shares the same images with textdet task + self.img_dir = 'textdet_imgs' def pack_instance(self, sample: Tuple, @@ -401,7 +404,8 @@ def pack_instance(self, sample: Tuple, split: str) -> Dict: img_name, text = sample packed_instance = dict( - instances=[dict(text=text)], img_path=osp.join(split, img_name)) + instances=[dict(text=text)], + img_path=osp.join(self.img_dir, split, img_name)) return packed_instance @@ -439,8 +443,6 @@ class TextRecogCropConverter(TextRecogDataConverter): the cropped image. Defaults to 0.05. delete (Optional[List]): A list of files to be deleted after conversion. Defaults to ['annotations]. - crop_save_dir (str): The directory to save the cropped images. - Defaults to 'crops'. 
""" def __init__(self, @@ -452,8 +454,7 @@ def __init__(self, nproc: int, long_edge_pad_ratio: float = 0.1, short_edge_pad_ratio: float = 0.05, - delete: List = ['annotations'], - crop_save_path: str = 'crops'): + delete: List = ['annotations']): super().__init__( splits=splits, data_root=data_root, @@ -465,7 +466,10 @@ def __init__(self, self.ignore = self.parser.ignore self.lepr = long_edge_pad_ratio self.sepr = short_edge_pad_ratio - self.crop_save_path = osp.join(self.data_root, crop_save_path) + # Crop converter crops the images of textdet to patches + self.img_dir = 'textdet_imgs' + self.cropped_img_dir = 'textrecog_imgs' + self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir) mkdir_or_exist(self.crop_save_path) for split in splits: mkdir_or_exist(osp.join(self.crop_save_path, split)) @@ -503,7 +507,8 @@ def get_box(instance: Dict) -> List: dst_path = osp.join(self.crop_save_path, split, patch_name) mmcv.imwrite(patch, dst_path) rec_instance = dict( - instances=[dict(text=text)], img_path=f'{split}/{patch_name}') + instances=[dict(text=text)], + img_path=osp.join(self.cropped_img_dir, split, patch_name)) data_list.append(rec_instance) return data_list diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py index 4349498bc..de092d28d 100644 --- a/mmocr/datasets/preparers/data_obtainer.py +++ b/mmocr/datasets/preparers/data_obtainer.py @@ -26,19 +26,22 @@ class NaiveDataObtainer: data_root (str): The root path of the dataset. 
""" - def __init__(self, files: List[Dict], cache_path: str, - data_root: str) -> None: + def __init__(self, files: List[Dict], cache_path: str, data_root: str, + task: str) -> None: self.files = files self.cache_path = cache_path self.data_root = data_root + self.task = task mkdir_or_exist(self.data_root) - mkdir_or_exist(osp.join(self.data_root, 'imgs')) + mkdir_or_exist(osp.join(self.data_root, f'{task}_imgs')) mkdir_or_exist(osp.join(self.data_root, 'annotations')) mkdir_or_exist(self.cache_path) def __call__(self): for file in self.files: - save_name, url, md5 = file['save_name'], file['url'], file['md5'] + save_name = file.get('save_name', None) + url = file.get('url', None) + md5 = file.get('md5', None) download_path = osp.join( self.cache_path, osp.basename(url) if save_name is None else save_name) @@ -73,7 +76,7 @@ def progress(down: float, block: float, size: float) -> None: file_name = osp.basename(dst_path) print(f'\rDownloading {file_name}: {percent:.2f}%', end='') - if not url and not osp.exists(dst_path): + if url is None and not osp.exists(dst_path): raise FileNotFoundError( 'Direct url is not available for this dataset.' 
' Please manually download the required files' diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index bbc46a028..6b7586ae5 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -55,10 +55,10 @@ def __init__(self, def __call__(self): """Prepare the dataset.""" if self.with_obtainer: - print('Obtain Dataset...') + print('Obtaining Dataset...') self.data_obtainer() if self.with_converter: - print('Convert Dataset...') + print('Converting Dataset...') self.data_converter() def parse_meta(self, cfg_path: str) -> None: @@ -99,6 +99,7 @@ def parse_cfg(self, cfg_path: str) -> None: cfg = Config.fromfile(osp.join(cfg_path, self.task + '.py')) if 'data_obtainer' in cfg: + cfg.data_obtainer.update(task=self.task) self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer) if 'data_converter' in cfg: cfg.data_converter.update(dict(nproc=self.nproc)) diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py index d6c93a01e..4228fa67a 100644 --- a/mmocr/datasets/preparers/parsers/base.py +++ b/mmocr/datasets/preparers/parsers/base.py @@ -38,7 +38,7 @@ def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]: """Convert annotations to MMOCR format. Args: - files (Tuple): A tuple of path to image and annotation. + files (Tuple): A list of tuple of path to image and annotation. Returns: List[Tuple]: A list of a tuple of (image_path, instances) @@ -48,8 +48,21 @@ def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]: return samples @abstractmethod - def parse_file(self, file: Tuple, split: str) -> Dict: - """Convert annotation for a single image.""" + def parse_file(self, file: Tuple, split: str) -> Tuple: + """Convert annotation for a single image. + + Args: + file (Tuple): A tuple of path to image and annotation + split (str): Current split. + + Returns: + Tuple: A tuple of (img_path, instance). 
Instance is a dict + containing parsed annotations, which should contain the + following keys: + - 'poly' or 'box' (textdet or textspotting) + - 'text' (textspotting or textrecog) + - 'ignore' (all task) + """ raise NotImplementedError def loader(self, diff --git a/mmocr/datasets/preparers/parsers/ic15_parser.py b/mmocr/datasets/preparers/parsers/ic15_parser.py index fff7b81e8..25b299038 100644 --- a/mmocr/datasets/preparers/parsers/ic15_parser.py +++ b/mmocr/datasets/preparers/parsers/ic15_parser.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Tuple +from typing import List, Tuple from ..data_preparer import DATA_PARSERS from .base import BaseParser @@ -37,7 +37,7 @@ def __init__(self, self.ignore = ignore super().__init__(nproc=nproc) - def parse_file(self, file: Tuple, split: str) -> Dict: + def parse_file(self, file: Tuple, split: str) -> Tuple: """Parse single annotation.""" img_file, txt_file = file instances = list() diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index 2e0173121..39dfabef4 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -6,8 +6,8 @@ is_type_list, valid_boundary) from .collect_env import collect_env from .data_converter_utils import dump_ocr_data, recog_anno_to_imginfo -from .fileio import (check_integrity, is_archive, list_from_file, list_to_file, - retrieve_files) +from .fileio import (check_integrity, is_archive, list_files, list_from_file, + list_to_file) from .img_utils import crop_img, warp_img from .mask_utils import fill_hole from .parsers import LineJsonParser, LineStrParser @@ -42,5 +42,5 @@ 'OptConfigType', 'OptDetSampleList', 'OptInitConfigType', 'OptMultiConfig', 'OptRecSampleList', 'RecSampleList', 'MultiConfig', 'OptTensor', 'ColorType', 'OptKIESampleList', 'KIESampleList', 'is_archive', - 'check_integrity', 'retrieve_files' + 'check_integrity', 'list_files' ] diff --git a/mmocr/utils/fileio.py b/mmocr/utils/fileio.py index 
efd55680e..ee008bee2 100644 --- a/mmocr/utils/fileio.py +++ b/mmocr/utils/fileio.py @@ -3,6 +3,7 @@ import os.path as osp import sys from glob import glob +from logging import warning from typing import List from mmengine import mkdir_or_exist @@ -73,6 +74,9 @@ def check_integrity(file_path: str, Returns: bool: Whether the md5 is matched. """ + if md5 is None: + warning.warn('MD5 is None, skip the integrity check.') + return True if not osp.exists(file_path): return False @@ -87,7 +91,7 @@ def check_integrity(file_path: str, return hash.hexdigest() == md5 -def retrieve_files(path: str, suffixes: List) -> List: +def list_files(path: str, suffixes: List) -> List: """Retrieve file list from the path. Args: diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py index 84e813b95..aee8ca9e5 100644 --- a/tools/dataset_converters/prepare_dataset.py +++ b/tools/dataset_converters/prepare_dataset.py @@ -20,7 +20,8 @@ def parse_args(): '--task', default='textdet', choices=['textdet', 'textrecog', 'textspotting', 'kie'], - help='Task type. Options are "textdet", "textrecog", "textspotting" and "kie".') + help='Task type. Options are "textdet", "textrecog", "textspotting"' + ' and "kie".') parser.add_argument( '--dataset-zoo-path', default='./dataset_zoo', From b380deee1684c2dc1cfe569449ad37248311bdce Mon Sep 17 00:00:00 2001 From: Xinyu Date: Sat, 29 Oct 2022 15:38:42 +0800 Subject: [PATCH 11/20] add fileio tests --- tests/test_utils/test_fileio.py | 56 ++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/tests/test_utils/test_fileio.py b/tests/test_utils/test_fileio.py index e3f929db1..14e2618f4 100644 --- a/tests/test_utils/test_fileio.py +++ b/tests/test_utils/test_fileio.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import json import tempfile +import unittest -from mmocr.utils import list_from_file, list_to_file +from mmocr.utils import (check_integrity, is_archive, list_files, + list_from_file, list_to_file) lists = [ [], @@ -102,3 +104,55 @@ def test_list_from_file(): lines = list(map(str, lines)) assert len(lines) == len(lines2) assert all(line1 == line2 for line1, line2 in zip(lines, lines2)) + + +class TestIsArchive(unittest.TestCase): + + def setUp(self) -> None: + self.zip = 'data/annotations_123.zip' + self.tar = 'data/img.abc.tar' + self.targz = 'data/img12345_.tar.gz' + self.rar = '/m/abc/t.rar' + self.dir = '/a/b/c/' + + def test_is_archive(self): + # test zip + self.assertTrue(is_archive(self.zip)) + # test tar + self.assertTrue(is_archive(self.tar)) + # test tar.gz + self.assertTrue(is_archive(self.targz)) + # test rar + self.assertFalse(is_archive(self.rar)) + # test dir + self.assertFalse(is_archive(self.dir)) + + +class TestCheckIntegrity(unittest.TestCase): + + def setUp(self) -> None: + self.file1 = ('tests/data/det_toy_dataset/instances_test.json', + '77b17b0125996af519ef82aaacc8d96b') + self.file2 = ('tests/data/det_toy_dataset/imgs/test/img_1.jpg', + 'abc123') + + def test_check_integrity(self): + file, md5 = self.file1 + self.assertTrue(check_integrity(file, md5)) + file, md5 = self.file2 + self.assertFalse(check_integrity(file, md5)) + + +class TestListFiles(unittest.TestCase): + + def setUp(self) -> None: + self.path = 'tests/data/det_toy_dataset/imgs/test' + self.files = [] + for i in range(1, 11): + self.files.append(f'{self.path}/img_{i}.jpg') + self.files.sort() + + def test_check_integrity(self): + files = list_files(self.path, 'jpg') + files.sort() + self.assertEqual(files, self.files) From f9969104183a11c8e4c750e0e540f6f21ebed520 Mon Sep 17 00:00:00 2001 From: Xinyu Date: Sat, 29 Oct 2022 15:54:37 +0800 Subject: [PATCH 12/20] fix test --- tests/test_utils/test_fileio.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff 
--git a/tests/test_utils/test_fileio.py b/tests/test_utils/test_fileio.py index 14e2618f4..b718ee7d8 100644 --- a/tests/test_utils/test_fileio.py +++ b/tests/test_utils/test_fileio.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import json +import os import tempfile import unittest @@ -147,12 +148,10 @@ class TestListFiles(unittest.TestCase): def setUp(self) -> None: self.path = 'tests/data/det_toy_dataset/imgs/test' - self.files = [] - for i in range(1, 11): - self.files.append(f'{self.path}/img_{i}.jpg') - self.files.sort() def test_check_integrity(self): - files = list_files(self.path, 'jpg') - files.sort() - self.assertEqual(files, self.files) + suffix = 'jpg' + files = list_files(self.path, suffix) + for file in os.listdir(self.path): + if file.endswith(suffix): + self.assertIn(os.path.join(self.path, file), files) From 28dc21ef8a2f3d71e430c4a470dd9864d24d50d7 Mon Sep 17 00:00:00 2001 From: Xinyu Date: Mon, 31 Oct 2022 14:43:01 +0800 Subject: [PATCH 13/20] add tests for parsers and dumpers --- mmocr/datasets/preparers/dumpers/dumpers.py | 21 ++++++- mmocr/datasets/preparers/parsers/__init__.py | 2 +- .../{wildreceipt.py => wildreceipt_parser.py} | 0 .../test_dumpers/test_dumpers.py | 38 +++++++++++++ .../test_parsers/test_ic15_parsers.py | 55 ++++++++++++++++++ .../test_parsers/test_tt_parsers.py | 36 ++++++++++++ .../test_parsers/test_wildreceipt_parsers.py | 57 +++++++++++++++++++ 7 files changed, 205 insertions(+), 4 deletions(-) rename mmocr/datasets/preparers/parsers/{wildreceipt.py => wildreceipt_parser.py} (100%) create mode 100644 tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py create mode 100644 tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py create mode 100644 tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py create mode 100644 tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py 
b/mmocr/datasets/preparers/dumpers/dumpers.py index 93543cd88..d19142f78 100644 --- a/mmocr/datasets/preparers/dumpers/dumpers.py +++ b/mmocr/datasets/preparers/dumpers/dumpers.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp -from typing import List +from typing import Dict, List import mmengine @@ -13,9 +13,16 @@ class JsonDumper: def __init__(self, task: str) -> None: self.task = task - self.format = format - def dump(self, data: List, data_root: str, split: str) -> None: + def dump(self, data: Dict, data_root: str, split: str) -> None: + """Dump data to json file. + + Args: + data (Dict): Data to be dumped. + data_root (str): Root directory of data. + split (str): Split of data. + """ + dst_file = osp.join(data_root, f'{self.task}_{split}.json') mmengine.dump(data, dst_file) @@ -27,4 +34,12 @@ def __init__(self, task: str) -> None: self.task = task def dump(self, data: List, data_root: str, split: str) -> None: + """Dump data to txt file. + + Args: + data (List): Data to be dumped. + data_root (str): Root directory of data. + split (str): Split of data. 
+ """ + list_to_file(osp.join(data_root, f'openset_{split}.txt'), data) diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 832013501..d296c025e 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -2,7 +2,7 @@ from .coco_parser import COCOTextDetAnnParser from .ic15_parser import ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser from .totaltext_parser import TotaltextTextDetAnnParser -from .wildreceipt import WildreceiptKIEAnnParser +from .wildreceipt_parser import WildreceiptKIEAnnParser __all__ = [ 'ICDAR2015TextDetAnnParser', 'ICDAR2015TextRecogAnnParser', diff --git a/mmocr/datasets/preparers/parsers/wildreceipt.py b/mmocr/datasets/preparers/parsers/wildreceipt_parser.py similarity index 100% rename from mmocr/datasets/preparers/parsers/wildreceipt.py rename to mmocr/datasets/preparers/parsers/wildreceipt_parser.py diff --git a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py new file mode 100644 index 000000000..57e9a2f45 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers.dumpers import (JsonDumper, + WildreceiptOpensetDumper) + + +class TestDumpers(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def test_json_dumpers(self): + task, split = 'textdet', 'train' + fake_data = dict( + metainfo=dict( + dataset_type='TextDetDataset', + task_name='textdet', + category=[dict(id=0, name='text')])) + + dumper = JsonDumper(task) + dumper.dump(fake_data, self.root.name, split) + with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f: + data = json.load(f) + self.assertEqual(data, fake_data) + + def test_wildreceipt_dumper(self): + task, split = 'kie', 'train' + fake_data = ['test1', 'test2'] + + dumper = WildreceiptOpensetDumper(task) + dumper.dump(fake_data, self.root.name, split) + with open(osp.join(self.root.name, f'openset_{split}.txt'), 'r') as f: + data = f.read().splitlines() + self.assertEqual(data, fake_data) diff --git a/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py new file mode 100644 index 000000000..b5713aefb --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers.parsers.ic15_parser import ( + ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser) +from mmocr.utils import list_to_file + + +class TestIC15Parsers(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def _create_dummy_ic15_det(self): + fake_anno = [ + '377,117,463,117,465,130,378,130,Genaxis Theatre', + '493,115,519,115,519,131,493,131,[06]', + '374,155,409,155,409,170,374,170,###', + ] + ann_file = osp.join(self.root.name, 'ic15_det.txt') + list_to_file(ann_file, fake_anno) + return (osp.join(self.root.name, 'ic15_det.jpg'), ann_file) + + def _create_dummy_ic15_recog(self): + fake_anno = [ + 'word_1.png, "Genaxis Theatre"', + 'word_2.png, "[06]"', + 'word_3.png, "62-03"', + ] + ann_file = osp.join(self.root.name, 'ic15_recog.txt') + list_to_file(ann_file, fake_anno) + return ann_file + + def test_textdet_parsers(self): + parser = ICDAR2015TextDetAnnParser() + file = self._create_dummy_ic15_det() + img, instances = parser.parse_file(file, 'train') + self.assertEqual(img, file[0]) + self.assertEqual(len(instances), 3) + self.assertIn('poly', instances[0]) + self.assertIn('text', instances[0]) + self.assertIn('ignore', instances[0]) + self.assertEqual(instances[0]['text'], 'Genaxis Theatre') + self.assertEqual(instances[2]['ignore'], True) + + def test_textrecog_parsers(self): + parser = ICDAR2015TextRecogAnnParser() + file = self._create_dummy_ic15_recog() + samples = parser.parse_files(file, 'train') + self.assertEqual(len(samples), 3) + img, text = samples[0] + self.assertEqual(img, 'word_1.png') + self.assertEqual(text, 'Genaxis Theatre') diff --git a/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py new file mode 100644 index 000000000..713d7fc70 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py @@ -0,0 
+1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers.parsers.totaltext_parser import \ + TotaltextTextDetAnnParser +from mmocr.utils import list_to_file + + +class TestTTParsers(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def _create_dummy_tt_det(self): + fake_anno = [ + "x: [[ 53 120 121 56]], y: [[446 443 456 458]], ornt: [u'h'], transcriptions: [u'PERUNDING']", # noqa: E501 + "x: [[123 165 166 125]], y: [[443 440 453 455]], ornt: [u'h'], transcriptions: [u'PENILAI']", # noqa: E501 + "x: [[168 179 179 167]], y: [[439 439 452 453]], ornt: [u'#'], transcriptions: [u'#']", # noqa: E501 + ] + ann_file = osp.join(self.root.name, 'tt_det.txt') + list_to_file(ann_file, fake_anno) + return (osp.join(self.root.name, 'tt_det.jpg'), ann_file) + + def test_textdet_parsers(self): + parser = TotaltextTextDetAnnParser(self.root.name) + file = self._create_dummy_tt_det() + img, instances = parser.parse_file(file, 'train') + self.assertEqual(img, file[0]) + self.assertEqual(len(instances), 3) + self.assertIn('poly', instances[0]) + self.assertIn('text', instances[0]) + self.assertIn('ignore', instances[0]) + self.assertEqual(instances[0]['text'], 'PERUNDING') + self.assertEqual(instances[2]['ignore'], True) diff --git a/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py new file mode 100644 index 000000000..f4e5510db --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers.parsers.wildreceipt_parser import ( + WildreceiptKIEAnnParser, WildreceiptTextDetAnnParser) +from mmocr.utils import list_to_file + + +class TestWildReceiptParsers(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + fake_sample = dict( + file_name='test.jpg', + height=100, + width=100, + annotations=[ + dict( + box=[ + 550.0, 190.0, 937.0, 190.0, 937.0, 104.0, 550.0, 104.0 + ], + text='test', + label=1, + ), + dict( + box=[ + 1048.0, 211.0, 1074.0, 211.0, 1074.0, 196.0, 1048.0, + 196.0 + ], + text='ATOREMGRTOMMILAZZO', + label=0, + ) + ]) + fake_sample = [json.dumps(fake_sample)] + self.anno = osp.join(self.root.name, 'wildreceipt.txt') + list_to_file(self.anno, fake_sample) + + def test_textdet_parsers(self): + parser = WildreceiptTextDetAnnParser(self.root.name) + samples = parser.parse_files(self.anno, 'train') + self.assertEqual(len(samples), 1) + self.assertEqual(osp.basename(samples[0][0]), 'test.jpg') + instances = samples[0][1] + self.assertEqual(len(instances), 2) + self.assertIn('poly', instances[0]) + self.assertIn('text', instances[0]) + self.assertIn('ignore', instances[0]) + self.assertEqual(instances[0]['text'], 'test') + self.assertEqual(instances[1]['ignore'], True) + + def test_kie_parsers(self): + parser = WildreceiptKIEAnnParser(self.root.name) + samples = parser.parse_files(self.anno, 'train') + self.assertEqual(len(samples), 1) From 882bda18cb548f1eecce97c3a5ab2b7b61932cc8 Mon Sep 17 00:00:00 2001 From: Xinyu Date: Mon, 31 Oct 2022 15:25:49 +0800 Subject: [PATCH 14/20] add test for data preparer --- mmocr/datasets/preparers/data_preparer.py | 4 +++- tests/data/preparer/dummy/metafile.yml | 24 +++++++++++++++++++ tests/data/preparer/dummy/textdet.py | 3 +++ .../test_preparers/test_data_preparer.py | 15 ++++++++++++ tests/test_utils/test_fileio.py | 4 ++++ 5 files changed, 49 insertions(+), 1 
deletion(-) create mode 100644 tests/data/preparer/dummy/metafile.yml create mode 100644 tests/data/preparer/dummy/textdet.py create mode 100644 tests/test_datasets/test_preparers/test_data_preparer.py diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 6b7586ae5..8be649438 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -96,7 +96,9 @@ def parse_cfg(self, cfg_path: str) -> None: Args: cfg_path (str): Path to dataset config file. """ - cfg = Config.fromfile(osp.join(cfg_path, self.task + '.py')) + cfg_path = osp.join(cfg_path, self.task + '.py') + assert osp.exists(cfg_path), f'Config file {cfg_path} not found!' + cfg = Config.fromfile(cfg_path) if 'data_obtainer' in cfg: cfg.data_obtainer.update(task=self.task) diff --git a/tests/data/preparer/dummy/metafile.yml b/tests/data/preparer/dummy/metafile.yml new file mode 100644 index 000000000..7706ef53d --- /dev/null +++ b/tests/data/preparer/dummy/metafile.yml @@ -0,0 +1,24 @@ +Name: Dummy Dataset +Paper: + Title: Dummy Dataset + URL: https://github.com/open-mmlab/mmocr + Venue: MMOCR + Year: 2022 + BibTeX: '' +Data: + Website: https://github.com/open-mmlab/mmocr + Language: + - English + - Chinese + Scene: + - Natural Scene + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .txt diff --git a/tests/data/preparer/dummy/textdet.py b/tests/data/preparer/dummy/textdet.py new file mode 100644 index 000000000..2fa11b202 --- /dev/null +++ b/tests/data/preparer/dummy/textdet.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+data_root = 'tests/data/preparer/dummy' +cache_path = 'tests/data/preparer/dummy' diff --git a/tests/test_datasets/test_preparers/test_data_preparer.py b/tests/test_datasets/test_preparers/test_data_preparer.py new file mode 100644 index 000000000..c531db6f0 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_data_preparer.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import unittest + +from mmocr.datasets.preparers.data_preparer import DatasetPreparer + + +class TestDataPreparer(unittest.TestCase): + + def setUp(self) -> None: + self.cfg_path = 'tests/data/preparer' + self.dataset_name = 'dummy' + + def test_dataset_preparer(self): + preparer = DatasetPreparer(self.cfg_path, self.dataset_name, 'textdet') + preparer() diff --git a/tests/test_utils/test_fileio.py b/tests/test_utils/test_fileio.py index b718ee7d8..2a2399d55 100644 --- a/tests/test_utils/test_fileio.py +++ b/tests/test_utils/test_fileio.py @@ -136,12 +136,16 @@ def setUp(self) -> None: '77b17b0125996af519ef82aaacc8d96b') self.file2 = ('tests/data/det_toy_dataset/imgs/test/img_1.jpg', 'abc123') + self.file3 = ('abc/abc.jpg', 'abc123') def test_check_integrity(self): file, md5 = self.file1 self.assertTrue(check_integrity(file, md5)) file, md5 = self.file2 self.assertFalse(check_integrity(file, md5)) + self.assertTrue(check_integrity(file, None)) + file, md5 = self.file3 + self.assertFalse(check_integrity(file, md5)) class TestListFiles(unittest.TestCase): From f8ac8f1a78b69fced4548bd605dcb9ce96bbc31a Mon Sep 17 00:00:00 2001 From: Xinyu Date: Mon, 31 Oct 2022 15:42:44 +0800 Subject: [PATCH 15/20] fix a bug --- mmocr/utils/fileio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmocr/utils/fileio.py b/mmocr/utils/fileio.py index ee008bee2..f7fad4ecf 100644 --- a/mmocr/utils/fileio.py +++ b/mmocr/utils/fileio.py @@ -2,8 +2,8 @@ import hashlib import os.path as osp import sys +import warnings from glob import glob -from logging import warning from 
typing import List from mmengine import mkdir_or_exist @@ -75,7 +75,7 @@ def check_integrity(file_path: str, bool: Whether the md5 is matched. """ if md5 is None: - warning.warn('MD5 is None, skip the integrity check.') + warnings.warn('MD5 is None, skip the integrity check.') return True if not osp.exists(file_path): return False From 49bf99da6a4a24bddce6136d4caa4f6ab2a446bb Mon Sep 17 00:00:00 2001 From: Xinyu Date: Tue, 1 Nov 2022 16:06:56 +0800 Subject: [PATCH 16/20] update icdar txt parser --- dataset_zoo/icdar2015/textdet.py | 2 +- dataset_zoo/icdar2015/textrecog.py | 2 +- .../user_guides/data_prepare/dataset_preparer.md | 2 +- .../user_guides/data_prepare/dataset_preparer.md | 2 +- mmocr/datasets/preparers/parsers/__init__.py | 4 ++-- mmocr/datasets/preparers/parsers/ic15_parser.py | 15 +++++++++++---- .../test_parsers/test_ic15_parsers.py | 6 +++--- 7 files changed, 20 insertions(+), 13 deletions(-) diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py index e3ae3b82a..3dfa6f76a 100644 --- a/dataset_zoo/icdar2015/textdet.py +++ b/dataset_zoo/icdar2015/textdet.py @@ -46,6 +46,6 @@ type='pair_gather', suffixes=['.jpg', '.JPG'], rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), - parser=dict(type='ICDAR2015TextDetAnnParser'), + parser=dict(type='ICDARTxtTextDetAnnParser'), dumper=dict(type='JsonDumper'), delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py index c28b76b39..5500bebd4 100644 --- a/dataset_zoo/icdar2015/textrecog.py +++ b/dataset_zoo/icdar2015/textrecog.py @@ -38,5 +38,5 @@ splits=['train', 'test'], data_root=data_root, gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"), - parser=dict(type='ICDAR2015TextRecogAnnParser'), + parser=dict(type='ICDARTxtTextRecogAnnParser'), dumper=dict(type='JsonDumper')) diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md 
b/docs/en/user_guides/data_prepare/dataset_preparer.md index 6e92c73c1..940d6137f 100644 --- a/docs/en/user_guides/data_prepare/dataset_preparer.md +++ b/docs/en/user_guides/data_prepare/dataset_preparer.md @@ -139,7 +139,7 @@ data_converter = dict( type='pair_gather', suffixes=['.jpg', '.JPG'], rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), - parser=dict(type='ICDAR2015TextDetAnnParser'), + parser=dict(type='ICDARTxtTextDetAnnParser'), dumper=dict(type='JsonDumper'), delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) ``` diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md index bc03b49b2..8015db62a 100644 --- a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md +++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md @@ -139,7 +139,7 @@ data_converter = dict( type='pair_gather', suffixes=['.jpg', '.JPG'], rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), - parser=dict(type='ICDAR2015TextDetAnnParser'), + parser=dict(type='ICDARTxtTextDetAnnParser'), dumper=dict(type='JsonDumper'), delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) ``` diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index d296c025e..89de13157 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -1,11 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .coco_parser import COCOTextDetAnnParser -from .ic15_parser import ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser +from .ic15_parser import ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser __all__ = [ - 'ICDAR2015TextDetAnnParser', 'ICDAR2015TextRecogAnnParser', + 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/ic15_parser.py b/mmocr/datasets/preparers/parsers/ic15_parser.py index 25b299038..c33dbdd34 100644 --- a/mmocr/datasets/preparers/parsers/ic15_parser.py +++ b/mmocr/datasets/preparers/parsers/ic15_parser.py @@ -1,12 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple +from typing import List, Optional, Tuple from ..data_preparer import DATA_PARSERS from .base import BaseParser @DATA_PARSERS.register_module() -class ICDAR2015TextDetAnnParser(BaseParser): +class ICDARTxtTextDetAnnParser(BaseParser): """ICDAR2015 Text Detection Parser. The original annotation format of this dataset is stored in txt files, @@ -23,6 +23,8 @@ class ICDAR2015TextDetAnnParser(BaseParser): 'utf-8-sig'. nproc (int): The number of processes to parse the annotation. Defaults to 1. + remove_flag (List[str], Optional): Used to remove redundant strings in + the transcription. Defaults to None. 
""" def __init__(self, @@ -30,11 +32,13 @@ def __init__(self, ignore: str = '###', format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', encoding: str = 'utf-8-sig', - nproc: int = 1) -> None: + nproc: int = 1, + remove_flag: Optional[List[str]] = None) -> None: self.sep = separator self.format = format self.encoding = encoding self.ignore = ignore + self.remove_flag = remove_flag super().__init__(nproc=nproc) def parse_file(self, file: Tuple, split: str) -> Tuple: @@ -46,6 +50,9 @@ def parse_file(self, file: Tuple, split: str) -> Tuple: anno = list(anno.values()) poly = list(map(float, anno[0:-1])) text = anno[-1] + if self.remove_flag is not None: + for flag in self.remove_flag: + text = text.replace(flag, '') instances.append( dict(poly=poly, text=text, ignore=text == self.ignore)) @@ -53,7 +60,7 @@ def parse_file(self, file: Tuple, split: str) -> Tuple: @DATA_PARSERS.register_module() -class ICDAR2015TextRecogAnnParser(BaseParser): +class ICDARTxtTextRecogAnnParser(BaseParser): """ICDAR2015 Text Detection Parser. 
The original annotation format of this dataset is stored in txt files, diff --git a/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py index b5713aefb..cdbd88b04 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py @@ -4,7 +4,7 @@ import unittest from mmocr.datasets.preparers.parsers.ic15_parser import ( - ICDAR2015TextDetAnnParser, ICDAR2015TextRecogAnnParser) + ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser) from mmocr.utils import list_to_file @@ -34,7 +34,7 @@ def _create_dummy_ic15_recog(self): return ann_file def test_textdet_parsers(self): - parser = ICDAR2015TextDetAnnParser() + parser = ICDARTxtTextDetAnnParser() file = self._create_dummy_ic15_det() img, instances = parser.parse_file(file, 'train') self.assertEqual(img, file[0]) @@ -46,7 +46,7 @@ def test_textdet_parsers(self): self.assertEqual(instances[2]['ignore'], True) def test_textrecog_parsers(self): - parser = ICDAR2015TextRecogAnnParser() + parser = ICDARTxtTextRecogAnnParser() file = self._create_dummy_ic15_recog() samples = parser.parse_files(file, 'train') self.assertEqual(len(samples), 3) From 09bc4e2fd9c210d3f7cc33feab6a125cadbe9623 Mon Sep 17 00:00:00 2001 From: Xinyu Date: Tue, 1 Nov 2022 16:09:36 +0800 Subject: [PATCH 17/20] rename icdar txt parser --- mmocr/datasets/preparers/parsers/__init__.py | 3 ++- .../preparers/parsers/{ic15_parser.py => icdar_txt_parser.py} | 0 .../{test_ic15_parsers.py => test_icdar_txt_parsers.py} | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) rename mmocr/datasets/preparers/parsers/{ic15_parser.py => icdar_txt_parser.py} (100%) rename tests/test_datasets/test_preparers/test_parsers/{test_ic15_parsers.py => test_icdar_txt_parsers.py} (96%) diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 
89de13157..83681eab5 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .coco_parser import COCOTextDetAnnParser -from .ic15_parser import ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser +from .icdar_txt_parser import (ICDARTxtTextDetAnnParser, + ICDARTxtTextRecogAnnParser) from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser diff --git a/mmocr/datasets/preparers/parsers/ic15_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py similarity index 100% rename from mmocr/datasets/preparers/parsers/ic15_parser.py rename to mmocr/datasets/preparers/parsers/icdar_txt_parser.py diff --git a/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py similarity index 96% rename from tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py rename to tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py index cdbd88b04..e12820a69 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_ic15_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py @@ -3,7 +3,7 @@ import tempfile import unittest -from mmocr.datasets.preparers.parsers.ic15_parser import ( +from mmocr.datasets.preparers.parsers.icdar_txt_parser import ( ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser) from mmocr.utils import list_to_file From b7043613ce3cead5987dac6655c6a262e765751d Mon Sep 17 00:00:00 2001 From: Xinyu Date: Tue, 1 Nov 2022 18:01:32 +0800 Subject: [PATCH 18/20] fix comments --- dataset_zoo/wildreceipt/metafile.yml | 2 +- mmocr/datasets/preparers/data_converter.py | 66 ++++++++++++++++++- mmocr/datasets/preparers/data_obtainer.py | 1 + mmocr/datasets/preparers/data_preparer.py | 4 +- mmocr/datasets/preparers/dumpers/dumpers.py | 34 
+++++++++- .../datasets/preparers/parsers/coco_parser.py | 3 +- .../preparers/parsers/icdar_txt_parser.py | 14 ++-- 7 files changed, 109 insertions(+), 15 deletions(-) diff --git a/dataset_zoo/wildreceipt/metafile.yml b/dataset_zoo/wildreceipt/metafile.yml index 29c6f600c..0196a6fe1 100644 --- a/dataset_zoo/wildreceipt/metafile.yml +++ b/dataset_zoo/wildreceipt/metafile.yml @@ -12,7 +12,7 @@ Paper: } ' Data: - Website: https://github.com/cs-chan/Total-Text-Dataset + Website: https://download.openmmlab.com/mmocr/data/wildreceipt.tar Language: - English Scene: diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index 41eb4fa04..fb819aed5 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -25,6 +25,7 @@ class BaseDataConverter: dumper (Dict): Config dict for dumping the dataset files. nproc (int): Number of processes to process the data. task (str): Task of the dataset. + dataset_name (str): Dataset name. delete (Optional[List]): A list of files to be deleted after conversion. """ @@ -37,17 +38,21 @@ def __init__(self, dumper: Dict, nproc: int, task: str, - delete: Optional[List] = None): + dataset_name: str, + delete: Optional[List] = None, + config_path: str = 'configs/'): assert isinstance(nproc, int) and nproc > 0, \ 'nproc must be a positive integer.' 
self.splits = splits self.data_root = data_root self.nproc = nproc self.task = task + self.dataset_name = dataset_name self.delete = delete + self.config_path = config_path self.img_dir = f'{task}_imgs' parser.update(dict(nproc=nproc)) - dumper.update(dict(task=task)) + dumper.update(dict(task=task, dataset_name=dataset_name)) self.parser = DATA_PARSERS.build(parser) self.dumper = DATA_DUMPERS.build(dumper) gather_type = gatherer.pop('type') @@ -62,6 +67,7 @@ def __init__(self, def __call__(self): """Process the data.""" # Convert and dump annotations to MMOCR format + dataset_config = dict() for split in self.splits: print(f'Parsing {split} split...') # Gather the info such as file names required by parser @@ -78,9 +84,49 @@ def __call__(self): samples = track_parallel_progress(func, samples, nproc=self.nproc) samples = self.add_meta(samples) # Dump annotation files - self.dumper.dump(samples, self.data_root, split) + dataset_config[split] = self.dumper.dump(samples, self.data_root, + split) + self.generate_dataset_config(dataset_config) self.clean() + def generate_dataset_config(self, dataset_config: Dict) -> None: + """Generate dataset config file. Dataset config is a python file that + contains the dataset information. + + Examples: + Generated dataset config + >>> ic15_rec_data_root = 'data/icdar2015/' + >>> ic15_rec_train = dict( + >>> type='OCRDataset', + >>> data_root=ic15_rec_data_root, + >>> ann_file='textrecog_train.json', + >>> test_mode=False, + >>> pipeline=None) + >>> ic15_rec_test = dict( + >>> type='OCRDataset', + >>> data_root=ic15_rec_data_root, + >>> ann_file='textrecog_test.json', + >>> test_mode=True, + >>> pipeline=None) + + Args: + dataset_config (Dict): A dict contains the dataset config string of + each split. 
+ """ + if self.task == 'kie': + # Not supported yet + return + cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets', + f'{self.dataset_name}.py') + if not osp.exists(cfg_path): + with open(cfg_path, 'w') as f: + f.write( + f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501 + ) + for split in self.splits: + with open(cfg_path, 'a') as f: + f.write(dataset_config[split]) + @abstractmethod def pack_instance(self, sample: Tuple, split: str) -> Dict: """Pack the parsed annotation info to an MMOCR format instance. @@ -178,6 +224,7 @@ class TextDetDataConverter(BaseDataConverter): gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset files. dumper (Dict): Config dict for dumping the dataset files. + dataset_name (str): Name of the dataset. nproc (int): Number of processes to process the data. delete (Optional[List]): A list of files to be deleted after conversion. Defaults to ['annotations]. @@ -189,6 +236,7 @@ def __init__(self, gatherer: Dict, parser: Dict, dumper: Dict, + dataset_name: str, nproc: int, delete: List = ['annotations']) -> None: super().__init__( @@ -197,6 +245,7 @@ def __init__(self, gatherer=gatherer, parser=parser, dumper=dumper, + dataset_name=dataset_name, nproc=nproc, delete=delete, task='textdet') @@ -272,6 +321,7 @@ class TextSpottingDataConverter(BaseDataConverter): gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset files. dumper (Dict): Config dict for dumping the dataset files. + dataset_name (str): Name of the dataset. nproc (int): Number of processes to process the data. delete (Optional[List]): A list of files to be deleted after conversion. Defaults to ['annotations']. 
@@ -283,6 +333,7 @@ def __init__(self, gatherer: Dict, parser: Dict, dumper: Dict, + dataset_name: str, nproc: int, delete: List = ['annotations']) -> None: super().__init__( @@ -291,6 +342,7 @@ def __init__(self, gatherer=gatherer, parser=parser, dumper=dumper, + dataset_name=dataset_name, nproc=nproc, delete=delete, task='textspotting') @@ -368,6 +420,7 @@ class TextRecogDataConverter(BaseDataConverter): gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset annotations. dumper (Dict): Config dict for dumping the dataset files. + dataset_name (str): Name of the dataset. nproc (int): Number of processes to process the data. delete (Optional[List]): A list of files to be deleted after conversion. Defaults to ['annotations]. @@ -379,6 +432,7 @@ def __init__(self, gatherer: Dict, parser: Dict, dumper: Dict, + dataset_name: str, nproc: int, delete: List = ['annotations']): super().__init__( @@ -387,6 +441,7 @@ def __init__(self, gatherer=gatherer, parser=parser, dumper=dumper, + dataset_name=dataset_name, nproc=nproc, task='textrecog', delete=delete) @@ -436,6 +491,7 @@ class TextRecogCropConverter(TextRecogDataConverter): gatherer (Dict): Config dict for gathering the dataset files. parser (Dict): Config dict for parsing the dataset annotations. dumper (Dict): Config dict for dumping the dataset files. + dataset_name (str): Name of the dataset. nproc (int): Number of processes to process the data. long_edge_pad_ratio (float): The ratio of padding the long edge of the cropped image. Defaults to 0.1. 
@@ -451,6 +507,7 @@ def __init__(self, gatherer: Dict, parser: Dict, dumper: Dict, + dataset_name: str, nproc: int, long_edge_pad_ratio: float = 0.1, short_edge_pad_ratio: float = 0.05, @@ -461,6 +518,7 @@ def __init__(self, gatherer=gatherer, parser=parser, dumper=dumper, + dataset_name=dataset_name, nproc=nproc, delete=delete) self.ignore = self.parser.ignore @@ -540,6 +598,7 @@ def __init__(self, gatherer: Dict, parser: Dict, dumper: Dict, + dataset_name: str, nproc: int, delete: Optional[List] = None, merge_bg_others: bool = False, @@ -555,6 +614,7 @@ def __init__(self, gatherer=gatherer, parser=parser, dumper=dumper, + dataset_name=dataset_name, nproc=nproc, task='kie', delete=delete) diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py index de092d28d..43906967c 100644 --- a/mmocr/datasets/preparers/data_obtainer.py +++ b/mmocr/datasets/preparers/data_obtainer.py @@ -24,6 +24,7 @@ class NaiveDataObtainer: files (list[dict]): A list of file information. cache_path (str): The path to cache the downloaded files. data_root (str): The root path of the dataset. + task (str): The task of the dataset. 
""" def __init__(self, files: List[Dict], cache_path: str, data_root: str, diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 8be649438..98642aee8 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -49,6 +49,7 @@ def __init__(self, cfg_path = osp.join(cfg_path, dataset_name) self.nproc = nproc self.task = task + self.dataset_name = dataset_name self.parse_meta(cfg_path) self.parse_cfg(cfg_path) @@ -104,7 +105,8 @@ def parse_cfg(self, cfg_path: str) -> None: cfg.data_obtainer.update(task=self.task) self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer) if 'data_converter' in cfg: - cfg.data_converter.update(dict(nproc=self.nproc)) + cfg.data_converter.update( + dict(nproc=self.nproc, dataset_name=self.dataset_name)) self.data_converter = DATA_CONVERTERS.build(cfg.data_converter) @property diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py index d19142f78..8cc8d9a14 100644 --- a/mmocr/datasets/preparers/dumpers/dumpers.py +++ b/mmocr/datasets/preparers/dumpers/dumpers.py @@ -11,21 +11,47 @@ @DATA_DUMPERS.register_module() class JsonDumper: - def __init__(self, task: str) -> None: + def __init__(self, task: str, dataset_name: str) -> None: self.task = task + self.dataset_name = dataset_name - def dump(self, data: Dict, data_root: str, split: str) -> None: + def dump(self, data: Dict, data_root: str, split: str) -> str: """Dump data to json file. Args: data (Dict): Data to be dumped. data_root (str): Root directory of data. split (str): Split of data. + cfg_path (str): Path to configs. Defaults to 'configs/'. + + Returns: + str: String of dataset config. 
+ + Examples: + The returned dataset config + >>> ic15_rec_train = dict( + >>> type='OCRDataset', + >>> data_root=ic15_rec_data_root, + >>> ann_file='textrecog_train.json', + >>> test_mode=False, + >>> pipeline=None) """ dst_file = osp.join(data_root, f'{self.task}_{split}.json') mmengine.dump(data, dst_file) + cfg = f'\n{self.dataset_name}_{self.task}_{split} = dict (\n' + cfg += ' type=\'OCRDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{osp.basename(dst_file)}\',\n' + if split == 'train' and self.task == 'textdet': + cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' + elif split in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + + return cfg + @DATA_DUMPERS.register_module() class WildreceiptOpensetDumper: @@ -33,7 +59,7 @@ class WildreceiptOpensetDumper: def __init__(self, task: str) -> None: self.task = task - def dump(self, data: List, data_root: str, split: str) -> None: + def dump(self, data: List, data_root: str, split: str) -> str: """Dump data to txt file. Args: @@ -43,3 +69,5 @@ def dump(self, data: List, data_root: str, split: str) -> None: """ list_to_file(osp.join(data_root, f'openset_{split}.txt'), data) + + return None diff --git a/mmocr/datasets/preparers/parsers/coco_parser.py b/mmocr/datasets/preparers/parsers/coco_parser.py index 82a02afca..9b1cc8590 100644 --- a/mmocr/datasets/preparers/parsers/coco_parser.py +++ b/mmocr/datasets/preparers/parsers/coco_parser.py @@ -12,11 +12,12 @@ class COCOTextDetAnnParser(BaseParser): """COCO Text Detection Parser. Args: + data_root (str): The root path of the dataset. Defaults to None. nproc (int): The number of processes to parse the annotation. Defaults to 1. 
""" - def __init__(self, data_root=None, nproc: int = 1) -> None: + def __init__(self, data_root: str = None, nproc: int = 1) -> None: super().__init__(nproc=nproc, data_root=data_root) diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py index c33dbdd34..d69eb0d74 100644 --- a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py +++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py @@ -23,7 +23,7 @@ class ICDARTxtTextDetAnnParser(BaseParser): 'utf-8-sig'. nproc (int): The number of processes to parse the annotation. Defaults to 1. - remove_flag (List[str], Optional): Used to remove redundant strings in + remove_strs (List[str], Optional): Used to remove redundant strings in the transcription. Defaults to None. """ @@ -33,12 +33,12 @@ def __init__(self, format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', encoding: str = 'utf-8-sig', nproc: int = 1, - remove_flag: Optional[List[str]] = None) -> None: + remove_strs: Optional[List[str]] = None) -> None: self.sep = separator self.format = format self.encoding = encoding self.ignore = ignore - self.remove_flag = remove_flag + self.remove_strs = remove_strs super().__init__(nproc=nproc) def parse_file(self, file: Tuple, split: str) -> Tuple: @@ -48,11 +48,13 @@ def parse_file(self, file: Tuple, split: str) -> Tuple: for anno in self.loader(txt_file, self.sep, self.format, self.encoding): anno = list(anno.values()) + if self.remove_strs is not None: + for flag in self.remove_strs: + for i in range(len(anno)): + if flag in anno[i]: + anno[i] = anno[i].replace(flag, '') poly = list(map(float, anno[0:-1])) text = anno[-1] - if self.remove_flag is not None: - for flag in self.remove_flag: - text = text.replace(flag, '') instances.append( dict(poly=poly, text=text, ignore=text == self.ignore)) From 0130e1b9bb94f5da0bc2417da1b1c1c343cd97af Mon Sep 17 00:00:00 2001 From: Xinyu Date: Tue, 1 Nov 2022 18:13:28 +0800 Subject: [PATCH 19/20] fix test --- 
tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py index 57e9a2f45..512acedd3 100644 --- a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py +++ b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py @@ -21,7 +21,7 @@ def test_json_dumpers(self): task_name='textdet', category=[dict(id=0, name='text')])) - dumper = JsonDumper(task) + dumper = JsonDumper(task, dataset_name='test') dumper.dump(fake_data, self.root.name, split) with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f: data = json.load(f) From 24c39f87f826a828b6ae5adfbb428185779e635b Mon Sep 17 00:00:00 2001 From: Xinyu Date: Tue, 1 Nov 2022 19:55:04 +0800 Subject: [PATCH 20/20] fix comments --- mmocr/datasets/preparers/data_converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index fb819aed5..73d1e0eb0 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -28,6 +28,7 @@ class BaseDataConverter: dataset_name (str): Dataset name. delete (Optional[List]): A list of files to be deleted after conversion. + config_path (str): Path to the configs. Defaults to 'configs/'. """ def __init__(self,