# Origin: commit 5157daee428e5c5e6a70b1003ed8151ea1431832 ("[PATCH] demo")
# by gustinzhang, Sat, 31 Oct 2020 23:58:02 +0800 -- adds get_mess_from_gff.py.
"""Collect per-transcript UTR / exon / CDS intervals from a GFF file.

The script reads ``fr_gff_file_in`` line by line and fills
``transcript_mess_dir``, a dict mapping each mRNA ``ID`` to a
``Make_Transcript_Class.Make_Transcript_Class`` object.  Every
``five_prime_UTR``, ``three_prime_UTR``, ``exon`` and ``CDS`` record is stored
as a ``(start, end)`` tuple of ints on the most recently seen mRNA.

NOTE(review): sub-features are attached by file order, not by their ``Parent``
attribute, so the input is assumed to list each mRNA immediately before its
own sub-features -- confirm for inputs that contain other transcript types.
"""

import sys
import Make_Transcript_Class

fr_gff_file_in = "temp.gff"

# Feature type (GFF column 3) -> name of the transcript attribute that
# collects the (start, end) intervals of that feature type.
# Assumes these attributes are lists created by Make_Transcript_Class.
_INTERVAL_LISTS = {
    "five_prime_UTR": "UTR_mess",
    "three_prime_UTR": "UTR_mess",
    "exon": "exon_mess",
    "CDS": "CDS_mess",
}


def _feature_id(attributes, line_number):
    """Return the value of the ``ID`` tag in a GFF attributes column.

    All ``;``-separated attributes are searched, so ``ID`` does not have to be
    the first one.

    Raises:
        ValueError: if the column holds no ``ID=...`` attribute.
    """
    for attribute in attributes.split(";"):
        key, _, value = attribute.lstrip().partition("=")
        if key == "ID":
            return value
    raise ValueError(
        "{}: line {}: no ID attribute in {!r}".format(
            fr_gff_file_in, line_number, attributes
        )
    )


transcript_mess_dir = {}

# Most recently seen gene / mRNA IDs; None until the first such record, so a
# misordered file gives a clear error instead of a NameError.
gene_id = None
transcript_id = None

with open(fr_gff_file_in, 'r') as fr_gff_file:
    for line_number, line in enumerate(fr_gff_file, start=1):
        record = line.strip()
        if record == "##FASTA":
            # GFF3: everything after this directive is sequence data.
            break
        if not record or record.startswith("#"):
            # Blank lines, comments and other directives carry no features.
            continue

        items = record.split("\t")
        if len(items) < 3:
            raise ValueError(
                "{}: line {}: expected tab-separated GFF columns, got {!r}".format(
                    fr_gff_file_in, line_number, record
                )
            )

        feature_type = items[2]
        if feature_type == "gene":
            gene_id = _feature_id(items[-1], line_number)
        elif feature_type == "mRNA":
            if gene_id is None:
                raise ValueError(
                    "{}: line {}: mRNA record appears before any gene record".format(
                        fr_gff_file_in, line_number
                    )
                )
            transcript_id = _feature_id(items[-1], line_number)
            # items[6] is GFF column 7 (the strand column in GFF3).
            transcript_mess_dir[transcript_id] = (
                Make_Transcript_Class.Make_Transcript_Class(
                    items[6], gene_id, transcript_id
                )
            )
        elif feature_type in _INTERVAL_LISTS:
            if transcript_id is None:
                raise ValueError(
                    "{}: line {}: {} record appears before any mRNA record".format(
                        fr_gff_file_in, line_number, feature_type
                    )
                )
            interval = (int(items[3]), int(items[4]))
            transcript = transcript_mess_dir[transcript_id]
            getattr(transcript, _INTERVAL_LISTS[feature_type]).append(interval)
        # Any other feature type is ignored.


# Demo output: number of transcripts, then a spot check of one transcript.
# The lookups raise KeyError if "BMSK10000080.1" is not in the input file.
print(len(transcript_mess_dir))
print(transcript_mess_dir["BMSK10000080.1"].transcript_id)
print(len(transcript_mess_dir["BMSK10000080.1"].exon_mess))