P3: Update textdet data conversion scripts
gaotongxiao committed Jul 21, 2022
1 parent 3992f0d commit 1af7f94
Showing 25 changed files with 104 additions and 137 deletions.
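Every converter touched by this commit follows the same call-site migration: the COCO-style writer `convert_annotations(image_infos, out_json_name)` is replaced by `dump_ocr_data(image_infos, out_json_name, task_name)` with the task name `'textdet'`. The sketch below illustrates the pattern; the dataset path and the contents of `image_infos` are illustrative placeholders, not values taken from this diff.

```python
import os.path as osp

from mmocr.utils import dump_ocr_data

# Illustrative inputs only; real converters build `image_infos` from each
# dataset's ground-truth files. The anno_info layout shown here mirrors the
# COCO-style fields these converters already emit.
root_path = 'data/somedataset'
image_infos = [
    dict(
        file_name='imgs/training/img_1.jpg',
        height=720,
        width=1280,
        anno_info=[
            dict(
                iscrowd=0,
                category_id=1,
                bbox=[10, 20, 100, 40],
                area=4000.0,
                segmentation=[[10, 20, 110, 20, 110, 60, 10, 60]])
        ])
]

# Before this commit:
#   convert_annotations(image_infos,
#                       osp.join(root_path, 'instances_training.json'))
# After this commit -- the third argument selects the task:
dump_ocr_data(image_infos, osp.join(root_path, 'instances_training.json'),
              'textdet')
```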
20 changes: 9 additions & 11 deletions mmocr/utils/__init__.py
@@ -7,8 +7,7 @@
from .check_argument import (equal_len, is_2dlist, is_3dlist, is_none_or_type,
is_type_list, valid_boundary)
from .collect_env import collect_env
from .data_convert_util import (convert_annotations, dump_ocr_data,
recog_anno_to_imginfo)
from .data_convert_util import dump_ocr_data, recog_anno_to_imginfo
from .fileio import list_from_file, list_to_file
from .img_util import drop_orientation, is_not_png
from .lmdb_util import recog2lmdb
@@ -25,13 +24,12 @@
__all__ = [
'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env',
'is_3dlist', 'is_type_list', 'is_none_or_type', 'equal_len', 'is_2dlist',
'valid_boundary', 'drop_orientation', 'convert_annotations', 'is_not_png',
'list_to_file', 'list_from_file', 'is_on_same_line',
'stitch_boxes_into_lines', 'StringStrip', 'revert_sync_batchnorm',
'bezier_to_polygon', 'sort_points', 'recog2lmdb', 'dump_ocr_data',
'recog_anno_to_imginfo', 'rescale_polygons', 'rescale_polygon',
'rescale_bboxes', 'bbox2poly', 'crop_polygon', 'is_poly_inside_rect',
'poly2bbox', 'poly_intersection', 'poly_iou', 'poly_make_valid',
'poly_union', 'poly2shapely', 'polys2shapely', 'register_all_modules',
'dist_points2line', 'offset_polygon'
'valid_boundary', 'drop_orientation', 'is_not_png', 'list_to_file',
'list_from_file', 'is_on_same_line', 'stitch_boxes_into_lines',
'StringStrip', 'revert_sync_batchnorm', 'bezier_to_polygon', 'sort_points',
'recog2lmdb', 'dump_ocr_data', 'recog_anno_to_imginfo', 'rescale_polygons',
'rescale_polygon', 'rescale_bboxes', 'bbox2poly', 'crop_polygon',
'is_poly_inside_rect', 'poly2bbox', 'poly_intersection', 'poly_iou',
'poly_make_valid', 'poly_union', 'poly2shapely', 'polys2shapely',
'register_all_modules', 'dist_points2line', 'offset_polygon'
]
41 changes: 0 additions & 41 deletions mmocr/utils/data_convert_util.py
@@ -7,47 +7,6 @@
from mmocr.utils import is_type_list


# TODO: Remove it when all converters no longer need it
def convert_annotations(image_infos, out_json_name):
"""Convert the annotation into coco style.
Args:
image_infos(list): The list of image information dicts
out_json_name(str): The output json filename
Returns:
out_json(dict): The coco style dict
"""
assert isinstance(image_infos, list)
assert isinstance(out_json_name, str)
assert out_json_name

out_json = dict()
img_id = 0
ann_id = 0
out_json['images'] = []
out_json['categories'] = []
out_json['annotations'] = []
for image_info in image_infos:
image_info['id'] = img_id
anno_infos = image_info.pop('anno_info')
out_json['images'].append(image_info)
for anno_info in anno_infos:
anno_info['image_id'] = img_id
anno_info['id'] = ann_id
out_json['annotations'].append(anno_info)
ann_id += 1
img_id += 1
cat = dict(id=1, name='text')
out_json['categories'].append(cat)

if len(out_json['annotations']) == 0:
out_json.pop('annotations')
mmcv.dump(out_json, out_json_name)

return out_json


def dump_ocr_data(image_infos: Sequence[Dict], out_json_name: str,
task_name: str) -> Dict:
"""Dump the annotation in openmmlab style.
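For reference, the shape of the COCO-style dict that the deleted helper wrote is sketched below, mirrored from the removed body above. Field names other than the sequentially assigned ids depend on what each converter supplied, so the image and annotation fields shown are illustrative; `dump_ocr_data` writes the newer OpenMMLab-style annotation file instead, whose exact schema is not part of this diff.

```python
# Structure produced by the removed convert_annotations and written via
# mmcv.dump(). The 'id' / 'image_id' values were assigned sequentially by the
# helper; the remaining image/annotation fields are illustrative examples of
# what the converters pass through.
coco_style = {
    'images': [
        {'id': 0, 'file_name': 'imgs/training/img_1.jpg',
         'height': 720, 'width': 1280},
    ],
    'categories': [
        {'id': 1, 'name': 'text'},  # single fixed 'text' category
    ],
    # The 'annotations' key was dropped entirely when no instance existed.
    'annotations': [
        {'id': 0, 'image_id': 0, 'category_id': 1, 'iscrowd': 0,
         'bbox': [10, 20, 100, 40], 'area': 4000.0,
         'segmentation': [[10, 20, 110, 20, 110, 60, 10, 60]]},
    ],
}
```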
8 changes: 4 additions & 4 deletions tools/data/textdet/bid_converter.py
@@ -5,7 +5,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir):
@@ -175,9 +175,9 @@ def main():
image_infos = [image_infos]
splits = ['training']
for i, split in enumerate(splits):
convert_annotations(
image_infos[i],
osp.join(root_path, 'instances_' + split + '.json'))
dump_ocr_data(image_infos[i],
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':
9 changes: 5 additions & 4 deletions tools/data/textdet/cocotext_converter.py
@@ -5,7 +5,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def parse_args():
@@ -111,11 +111,12 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_cocotext_info(root_path, 'train')
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
print('Processing validation set...')
val_infos = collect_cocotext_info(root_path, 'val')
convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')


4 changes: 2 additions & 2 deletions tools/data/textdet/ctw1500_converter.py
@@ -9,7 +9,7 @@
import numpy as np
from shapely.geometry import Polygon

from mmocr.utils import convert_annotations, list_from_file
from mmocr.utils import dump_ocr_data, list_from_file


def collect_files(img_dir, gt_dir, split):
@@ -224,7 +224,7 @@ def main():
files = collect_files(
osp.join(img_dir, split), osp.join(gt_dir, split), split)
image_infos = collect_annotations(files, split, nproc=args.nproc)
convert_annotations(image_infos, osp.join(out_dir, json_name))
dump_ocr_data(image_infos, osp.join(out_dir, json_name), 'textdet')


if __name__ == '__main__':
8 changes: 4 additions & 4 deletions tools/data/textdet/detext_converter.py
@@ -6,7 +6,7 @@
import mmcv
import numpy as np

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir):
@@ -152,9 +152,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':
8 changes: 4 additions & 4 deletions tools/data/textdet/funsd_converter.py
@@ -6,7 +6,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir):
@@ -148,9 +148,9 @@ def main():
osp.join(root_path, 'imgs'),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':
9 changes: 5 additions & 4 deletions tools/data/textdet/hiertext_converter.py
@@ -6,7 +6,7 @@
import numpy as np
from shapely.geometry import Polygon

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_level_info(annotation):
@@ -139,11 +139,12 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_hiertext_info(root_path, args.level, 'train')
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
print('Processing validation set...')
val_infos = collect_hiertext_info(root_path, args.level, 'val')
convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')


8 changes: 4 additions & 4 deletions tools/data/textdet/ic11_converter.py
@@ -6,7 +6,7 @@
import mmcv
from PIL import Image

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def convert_gif(img_path):
@@ -163,9 +163,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':
8 changes: 4 additions & 4 deletions tools/data/textdet/ic13_converter.py
@@ -5,7 +5,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir, split):
@@ -156,9 +156,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split), split)
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':
4 changes: 2 additions & 2 deletions tools/data/textdet/icdar_converter.py
@@ -8,7 +8,7 @@
import numpy as np
from shapely.geometry import Polygon

from mmocr.utils import convert_annotations, list_from_file
from mmocr.utils import dump_ocr_data, list_from_file


def collect_files(img_dir, gt_dir):
@@ -176,7 +176,7 @@ def main():
osp.join(img_dir, split), osp.join(gt_dir, split))
image_infos = collect_annotations(
files, args.dataset, nproc=args.nproc)
convert_annotations(image_infos, osp.join(out_dir, json_name))
dump_ocr_data(image_infos, osp.join(out_dir, json_name), 'textdet')


if __name__ == '__main__':
6 changes: 3 additions & 3 deletions tools/data/textdet/ilst_converter.py
@@ -6,7 +6,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir):
@@ -196,9 +196,9 @@ def main():
image_infos = [image_infos]
splits = ['training']
for i, split in enumerate(splits):
convert_annotations(
dump_ocr_data(
list(filter(None, image_infos[i])),
osp.join(root_path, 'instances_' + split + '.json'))
osp.join(root_path, 'instances_' + split + '.json'), 'textdet')


if __name__ == '__main__':
7 changes: 4 additions & 3 deletions tools/data/textdet/imgur_converter.py
@@ -6,7 +6,7 @@
import mmcv
import numpy as np

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def parse_args():
@@ -141,8 +141,9 @@ def main():
with mmcv.Timer(print_tmpl='It takes {}s to convert IMGUR annotation'):
anno_infos = collect_imgur_info(
root_path, f'imgur5k_annotations_{split}.json')
convert_annotations(anno_infos,
osp.join(root_path, f'instances_{split}.json'))
dump_ocr_data(anno_infos,
osp.join(root_path, f'instances_{split}.json'),
'textdet')


if __name__ == '__main__':
11 changes: 6 additions & 5 deletions tools/data/textdet/kaist_converter.py
@@ -7,7 +7,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(img_dir, gt_dir, ratio):
@@ -183,16 +183,17 @@ def main():
trn_infos = collect_annotations(trn_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert KAIST Training annotation'):
convert_annotations(trn_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(trn_infos, osp.join(root_path,
'instances_training.json'),
'textdet')

# Val set
if len(val_files) > 0:
val_infos = collect_annotations(val_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert KAIST Val annotation'):
convert_annotations(val_infos,
osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')


if __name__ == '__main__':
10 changes: 5 additions & 5 deletions tools/data/textdet/lsvt_converter.py
@@ -5,7 +5,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def parse_args():
@@ -115,13 +115,13 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_lsvt_info(root_path, 'train', args.val_ratio)
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
if args.val_ratio > 0:
print('Processing validation set...')
val_infos = collect_lsvt_info(root_path, 'val', args.val_ratio)
convert_annotations(val_infos, osp.join(root_path,
'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')


8 changes: 4 additions & 4 deletions tools/data/textdet/lv_converter.py
@@ -6,7 +6,7 @@

import mmcv

from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data


def collect_files(data_dir):
@@ -171,9 +171,9 @@ def main():
with mmcv.Timer(print_tmpl='It takes {}s to convert LV annotation'):
files = collect_files(osp.join(root_path, 'imgs', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')


if __name__ == '__main__':