forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
se_epub_generate_toc.py
794 lines (640 loc) · 26.6 KB
/
se_epub_generate_toc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
#!/usr/bin/env python3
"""
This module contains the make-toc function which tries to create a
valid table of contents file for SE projects.
Strictly speaking, the generate_toc() function should be a class member of SeEpub. But
the function is very big and it makes editing easier to put in a separate file.
"""
from enum import Enum
from typing import Tuple, List
import regex
import roman
from lxml import etree
import se
import se.formatting
import se.easy_xml
from se.easy_xml import EasyXmlTree, EasyXmlElement
class BookDivision(Enum):
    """
    Enum naming the kind of book division a ToC item sits in.

    NONE is the value used when no division has been determined.
    """

    NONE = 0
    ARTICLE = 1
    SUBCHAPTER = 2
    CHAPTER = 3
    DIVISION = 4
    PART = 5
    VOLUME = 6
class Position(Enum):
    """
    Enum saying where in the book a file belongs: frontmatter,
    bodymatter or backmatter. NONE means the place is unknown or
    the item should be left out.
    """

    NONE = 0
    FRONT = 1
    BODY = 2
    BACK = 3
class TocItem:
    """
    Small class to hold data on each table of contents item
    (or landmark item) found in the project.
    """

    # pylint: disable=too-many-instance-attributes
    file_link = ""  # File name (optionally followed by `#id`), emitted after `text/` in hrefs
    hidden = False  # Did the <h#> element have the `hidden` attribute? If so, that means we MUST include it as a subtitle
    level = 0  # Nesting depth of the item within the ToC
    roman = ""  # Text of the Roman numeral found in the heading, if any
    title = ""
    subtitle = ""
    title_is_ordinal = False  # True when the title came from an ordinal (or label + ordinal) heading
    lang = ""  # `xml:lang` value picked up from the heading, if any
    toc_id = ""  # id of the section the item was found in
    epub_type = ""
    division = BookDivision.NONE
    place = Position.FRONT
    has_headers = True  # Set on landmarks: does the file contain any heading elements?

    @property
    def toc_link(self) -> str:
        """
        Generates the hyperlink for the ToC item.

        Reading this property never modifies the item, so it is safe
        to read it more than once.

        INPUTS:
        None

        OUTPUTS:
        the linking tag line eg <a href=... depending on the data found.

        RAISES:
        se.InvalidInputException if the item has no title.
        """

        if not self.title:
            raise se.InvalidInputException(f"Couldn’t find title in: [path][link=file://{self.file_link}]{self.file_link}[/][/].")

        # Wrap a foreign-language subtitle in a language span. This is done on a
        # local copy: writing the wrapped value back to self.subtitle would wrap
        # it again on every subsequent read of this property.
        subtitle = self.subtitle
        if subtitle and self.lang:
            subtitle = f"<span xml:lang=\"{self.lang}\">{subtitle}</span>"

        # If the title is entirely Roman numeral, put epub:type within <a>.
        if regex.search(r"^<span epub:type=\"z3998:roman\">[IVXLC]+<\/span>$", self.title):
            if subtitle == "":
                # No subtitle, so the roman flag goes on the <a> tag itself
                out_string = f"<a href=\"text/{self.file_link}\" epub:type=\"z3998:roman\">{self.roman}</a>\n"
            else:
                out_string = f"<a href=\"text/{self.file_link}\"><span epub:type=\"z3998:roman\">{self.roman}</span>: {subtitle}</a>\n"
        elif subtitle != "" and (self.hidden or self.title_is_ordinal or (self.division in [BookDivision.PART, BookDivision.DIVISION, BookDivision.VOLUME])):
            # Title has text other than a roman numeral.
            # Use the subtitle only if we're a Part or Division or Volume, if the
            # heading was hidden, or if title was an ordinal
            out_string = f"<a href=\"text/{self.file_link}\">{self.title}"

            # Don't append a colon if the ordinal already ends in punctuation, for example `1.` or `(a)`
            if not regex.search(r"\p{Punctuation}$", self.title):
                out_string += ":"

            out_string += f" {subtitle}</a>\n"
        elif self.lang:
            # Title only, in a foreign language
            out_string = f"<a href=\"text/{self.file_link}\" xml:lang=\"{self.lang}\">{self.title}</a>\n"
        else:
            out_string = f"<a href=\"text/{self.file_link}\">{self.title}</a>\n"

        # Replace <br/> with a single space
        return regex.sub(r"<br/>\s*", " ", out_string, flags=regex.DOTALL)

    def landmark_link(self, work_title: str = "WORK_TITLE") -> str:
        """
        Generates the landmark item (including list item tags) for the ToC item

        INPUTS:
        work_title: the title of the book, eg "Don Quixote"; only used for bodymatter items

        OUTPUTS:
        the linking string to be included in landmarks section, or an
        empty string if the item's place is not front, body or back.
        """

        if self.place == Position.FRONT:
            return f"<li>\n<a href=\"text/{self.file_link}\" epub:type=\"frontmatter {self.epub_type}\">{self.title}</a>\n</li>\n"

        if self.place == Position.BODY:
            # The bodymatter landmark is labelled with the work title, not the item's own title
            return f"<li>\n<a href=\"text/{self.file_link}\" epub:type=\"bodymatter\">{work_title}</a>\n</li>\n"

        if self.place == Position.BACK:
            return f"<li>\n<a href=\"text/{self.file_link}\" epub:type=\"backmatter {self.epub_type}\">{self.title}</a>\n</li>\n"

        return ""
def get_place(node: EasyXmlElement) -> Position:
    """
    Works out where in the ebook a file sits (frontmatter, bodymatter
    or backmatter) from the node's `epub:type` attribute.

    INPUTS:
    node: EasyXmlElement whose `epub:type` is inspected

    OUTPUTS:
    a Position enum value; Position.NONE if there is no `epub:type`
    or it names none of the three places
    """

    epub_type = node.get_attr("epub:type")
    if not epub_type:
        return Position.NONE

    # Tested in this order; the first keyword present decides the result.
    keyword_to_position = (
        ("backmatter", Position.BACK),
        ("frontmatter", Position.FRONT),
        ("bodymatter", Position.BODY),
    )
    for keyword, position in keyword_to_position:
        if keyword in epub_type:
            return position

    return Position.NONE
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
    Builds a landmark item for a file and appends it to the landmark
    list, unless the file is of a kind we keep out of the landmarks.

    INPUTS:
    dom: EasyXmlTree representation of the file we are indexing in ToC
    textf: path to the file
    landmarks: the list of landmark items we are building

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException if the file has no top-level
    <section>/<article>/<nav>, or no <body>.
    """

    # According to the IDPF a11y best practices page: <http://idpf.org/epub/a11y/techniques/#sem-003>:
    # > it is recommended to include a link to the start of the body matter as well as to any major
    # > reference sections (e.g., table of contents, endnotes, bibliography, glossary, index).
    #
    # So, we only want the start of the text, and (endnotes,glossary,bibliography,loi) in the landmarks.

    top_sections = dom.xpath("//body/*[name() = 'section' or name() = 'article' or name() = 'nav']")
    if not top_sections:
        raise se.InvalidInputException("Couldn’t locate first [xhtml]<section>[/], [xhtml]<article>[/], or [xhtml]<nav>[/].")
    section_epub_type = top_sections[0].get_attr("epub:type")

    bodys = dom.xpath("//body")
    if not bodys:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")

    # Some productions don't have an epub:type in outermost section, so fall back
    # to the body tag, and finally to an empty string.
    epub_type = section_epub_type or bodys[0].get_attr("epub:type") or ""

    # If epub_type is ONLY frontmatter, bodymatter, backmatter, we don't want this as a landmark
    if epub_type in ["frontmatter", "bodymatter", "backmatter"]:
        return

    # We don't want frontmatter in the landmarks
    if dom.xpath("//*[contains(@epub:type, 'frontmatter')]"):
        return

    # We only want certain backmatter in the landmarks
    is_backmatter = dom.xpath("//*[contains(@epub:type, 'backmatter')]")
    if is_backmatter and not regex.findall(r"\b(loi|endnotes|bibliography|glossary|index)\b", epub_type):
        return

    # We may wind up with a (front|body|back)matter semantic in epub_type, remove it here since we add it to the landmark later
    epub_type = regex.sub(r"(front|body|back)matter\s*", "", epub_type)

    landmark = TocItem()
    if epub_type:
        landmark.epub_type = epub_type
    landmark.file_link = textf
    landmark.place = get_place(bodys[0])
    landmark.has_headers = bool(dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6"))

    # The half title and titlepage always get these fixed titles
    fixed_titles = {"halftitlepage": "Half Title", "titlepage": "Titlepage"}
    if epub_type in fixed_titles:
        landmark.title = fixed_titles[epub_type]
    else:
        # Use the page title as the landmark entry title.
        landmark.title = dom.xpath("//head/title/text()", True)
        if landmark.title is None:
            # This is a bit desperate, use this only if there's no proper <title> tag in file.
            landmark.title = landmark.epub_type.capitalize()

    landmarks.append(landmark)
def process_landmarks(landmarks_list: list, work_title: str) -> str:
    """
    Runs through all found landmark items and renders them as the
    contents of the landmarks list.

    Note that frontmatter items appearing after the first bodymatter
    item have their `place` reset to Position.NONE in the passed list.

    INPUTS:
    landmarks_list: the completed list of landmark items
    work_title: the title of the book

    OUTPUTS:
    a string of <li> entries: all frontmatter items, the first
    bodymatter item only, then all backmatter items
    """

    front_items = []
    body_items = []
    back_items = []
    seen_body = False

    for landmark in landmarks_list:
        if landmark.place == Position.BODY:
            seen_body = True

        # We don't want frontmatter items to be included once we've started the body items
        if seen_body and landmark.place == Position.FRONT:
            landmark.place = Position.NONE

        if landmark.place == Position.FRONT:
            front_items.append(landmark)
        elif landmark.place == Position.BODY:
            body_items.append(landmark)
        elif landmark.place == Position.BACK:
            back_items.append(landmark)

    rendered = [landmark.landmark_link() for landmark in front_items]

    if body_items:
        # Just the first bodymatter item, labelled with the work title.
        rendered.append(body_items[0].landmark_link(work_title))

    rendered.extend(landmark.landmark_link() for landmark in back_items)

    return "".join(rendered)
def process_items(item_list: list) -> str:
    """
    Renders the found ToC items as (possibly nested) list markup.

    Each item is compared with the one after it to decide whether to
    open a nested list, close one or more, or simply close the entry.
    The final element of the list is therefore only ever looked at as a
    successor and is never rendered itself (callers append a dummy).

    INPUTS:
    item_list: list of ToC items, ending with a dummy item

    OUTPUTS:
    A string representing (possibly nested) html lists of the structure of the ToC
    """

    open_lists = 0  # How many nested <ol>s are currently open.
    chunks = []

    # Walk the list as (item, successor) pairs.
    for current, following in zip(item_list, item_list[1:]):
        chunks.append("<li>\n")
        chunks.append(current.toc_link)

        if following.level > current.level:
            # PARENT: leave the <li> open and start a nested list inside it.
            chunks.append("<ol>\n")
            open_lists += 1
            continue

        # SIMPLE or LAST CHILD: close off this item.
        chunks.append("</li>\n")

        # If the next item is shallower, close one nested list (and its parent
        # item) per level we climb, eg when jumping back from h5 to h2. Never
        # close more lists than we have actually opened.
        levels_to_climb = current.level - following.level
        while levels_to_climb and open_lists:
            chunks.append("</ol>\n")
            chunks.append("</li>\n")
            open_lists -= 1
            levels_to_climb -= 1

    return "".join(chunks)
def output_toc(item_list: list, landmark_list, toc_path: str, work_title: str) -> str:
    """
    Builds the new ToC from the lists of items and landmarks found,
    using the existing ToC file as the template. The result is
    returned; nothing is written by this function.

    INPUTS:
    item_list: list of ToC items (the first part of the ToC)
    landmark_list: list of landmark items (the second part of the ToC)
    toc_path: path to the existing ToC file
    work_title: the title of the book

    OUTPUTS:
    a html string representing the new ToC

    RAISES:
    se.InvalidInputException if there are fewer than two ToC items, if the
    existing ToC can't be read or parsed, or if it has fewer than two <nav>s.
    """

    if len(item_list) < 2:
        raise se.InvalidInputException("Too few ToC items found.")

    try:
        with open(toc_path, "r", encoding="utf-8") as file:
            toc_dom = se.easy_xml.EasyXmlTree(file.read())
    except Exception as ex:
        # Chain the original error explicitly so its traceback is kept as the cause.
        raise se.InvalidInputException(f"Existing ToC not found. Exception: {ex}") from ex

    # There should be exactly two nav sections.
    navs = toc_dom.xpath("//nav")

    if len(navs) < 2:
        raise se.InvalidInputException("Existing ToC has too few nav sections.")

    # now remove and then re-add the ol sections to clear them
    for nav in navs:
        ols = nav.xpath("./ol")  # just want the immediate ol children
        for ol_item in ols:
            ol_item.remove()

    # this is ugly and stupid, but I can't figure out an easier way to do it:
    # add empty <ol>s holding placeholder text, serialize the document, then
    # swap each placeholder for the generated markup.
    item_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    item_ol.lxml_element.text = "TOC_ITEMS"
    navs[0].append(item_ol)

    landmark_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    landmark_ol.lxml_element.text = "LANDMARK_ITEMS"
    navs[1].append(landmark_ol)

    xhtml = toc_dom.to_string()
    xhtml = xhtml.replace("TOC_ITEMS", process_items(item_list))
    xhtml = xhtml.replace("LANDMARK_ITEMS", process_landmarks(landmark_list, work_title))

    return se.formatting.format_xhtml(xhtml)
def get_parent_id(hchild: EasyXmlElement) -> str:
    """
    Looks up the tree from a heading for the closest <section> or
    <article> that carries an id, and returns that id.

    INPUTS:
    hchild: a heading tag for which we want to find the parent id

    OUTPUTS:
    the id of the parent section, or an empty string if none was found
    """

    # position() = 1 gets the nearest ancestor
    nearest_with_id = hchild.xpath("./ancestor::*[name() = 'section' or name() = 'article'][@id][position() = 1]")

    return nearest_with_id[0].get_attr("id") if nearest_with_id else ""
def extract_strings(node: EasyXmlElement) -> str:
    """
    Returns the inner markup of a tag as a single line: noterefs are
    stripped out, and all linefeeds and tabs are removed.

    INPUTS:
    node: a tag as xpath node

    OUTPUTS:
    just the string contents of the tag
    """

    inner_markup = strip_notes(node.inner_xml())

    # One pass removes both linefeeds and tabs.
    return regex.sub(r"[\n\t]", "", inner_markup)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list, single_file: bool, single_file_without_headers: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list. A file with no headings at all
    gets a single item titled from its <title> element.

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building
    single_file: is there only a single content item in the production?
    single_file_without_headers: is there only a single content item, and does it lack headings?

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException if the file has no <body>, or has neither
    headings nor any <p>, <header> or <img>.
    """

    bodies = dom.xpath("//body")
    if not bodies:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(bodies[0])

    # Find all the hgroups and h1, h2 etc headings.
    headings = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any heading or hgroups.
    # May be a dedication or an epigraph, with no heading tag.
    if not headings:
        # We don't have a heading, so get first content item
        content_items = dom.xpath("//p | //header | //img")
        if not content_items:
            raise se.InvalidInputException(f"Unable to find heading or content item (p, header or img) in file: [path][link=file://{textf}]{textf}[/][/].")
        first_content = content_items[0]

        special_item = TocItem()

        # Need to determine level depth: if there's a data-parent we use that,
        # otherwise special items get a default depth of 1
        if first_content.xpath("//*[@data-parent]"):
            special_item.level = get_level(first_content, toc_list)
        else:
            special_item.level = 1

        # Use the page title as the ToC entry title.
        page_title = dom.xpath("//head/title/text()", True)
        special_item.title = "NO TITLE" if page_title is None else page_title

        special_item.file_link = textf

        # If no ancestor section supplies an id, use the quick and dirty method
        special_item.toc_id = get_toc_id_for_special_item(first_content) or textf.replace('.xhtml','')

        special_item.place = place
        toc_list.append(special_item)
        return

    is_toplevel = True
    for heading in headings:
        # Don't process a heading separately if it's within a hgroup
        if heading.parent.tag == "hgroup":
            continue

        # If it's not a bodymatter item we don't care about whether it's single_file
        effective_single_file = single_file if place == Position.BODY else False
        toc_item = process_a_heading(heading, textf, is_toplevel, effective_single_file)

        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        # Exception: The titlepage is always titled 'Titlepage' in the ToC
        if dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"):
            toc_item.title = "Titlepage"

        # Exception: If there is only a single body item WITHOUT HEADERS (like Father Goriot or The Path to Rome),
        # the half title page is listed as "Half-Titlepage" instead of the work title,
        # so that we don't duplicate the work title in the ToC. We always include a link to the work body
        # in the ToC because readers on the web version need to have access to the text starting point, since
        # there are no back/forward nav buttons in XHTML files served on the web.
        if single_file_without_headers and dom.xpath("//section[re:test(@epub:type, '\\bhalftitlepage\\b')]"):
            toc_item.title = "Half-Titlepage"

        # Only the first heading actually processed counts as top level.
        is_toplevel = False
        toc_list.append(toc_item)
def get_toc_id_for_special_item(node: EasyXmlElement) -> str:
    """
    Get the id for a 'special item' node (one from a file with no headings).

    INPUTS:
    node: the content item to start from

    OUTPUTS:
    the first non-empty id among the node's <section>/<article>
    ancestors, in the order the xpath query returns them, or an
    empty string if none has one
    """

    ancestor_ids = (
        ancestor.get_attr("id")
        for ancestor in node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")
    )

    return next((ancestor_id for ancestor_id in ancestor_ids if ancestor_id), "")
def get_level(node: EasyXmlElement, toc_list: list) -> int:
    """
    Get the ToC level of a node.

    The level starts as the number of <section>/<article> ancestors
    (minimum 1). If a `data-parent` attribute is found and it names an
    item already in toc_list, the level is offset by that item's level.

    INPUTS:
    node: the heading (or content item) to measure
    toc_list: the ToC items processed so far

    OUTPUTS:
    the level as an integer
    """

    # First need to check how deep this heading is within the current file
    enclosing_sections = node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")
    depth = len(enclosing_sections) if enclosing_sections else 1

    if not node.parent:
        return depth  # must be at the top level

    data_parents = node.xpath("//*[@data-parent]")
    if not data_parents:
        return depth

    data_parent = data_parents[0].get_attr("data-parent")
    if not data_parent:
        return depth

    # See if we can find it in already processed (as we should if spine is correctly ordered)
    for candidate in toc_list:
        if candidate.toc_id == data_parent:
            # One level below the parent, plus our depth in this file, minus one
            # because all headings should have depth >= 1.
            return candidate.level + depth

    return depth
def process_a_heading(node: EasyXmlElement, textf: str, is_toplevel: bool, single_file: bool) -> TocItem:
    """
    Generate and return a single TocItem from this heading.

    INPUTS:
    node: an EasyXml node representing a heading
    textf: the path to the file
    is_toplevel: is this heading at the top-most level in the file?
    single_file: is there only one content file in the production (like some Poetry volumes)?

    OUTPUTS:
    a qualified ToCItem object

    RAISES:
    se.InvalidInputException if a heading has no parent, or if a heading
    tagged as a roman numeral isn't a valid one.
    """

    toc_item = TocItem()
    toc_item.division = get_book_division(node)

    # is_toplevel stops the first heading in a file getting an anchor id, we don't generally want that.
    # The exceptions are things like poems within a single-file volume.
    toc_item.toc_id = get_parent_id(node)  # pylint: disable=invalid-name
    wants_anchor = toc_item.toc_id != "" and (not is_toplevel or single_file)
    toc_item.file_link = f"{textf}#{toc_item.toc_id}" if wants_anchor else textf

    toc_item.lang = node.get_attr("xml:lang")

    if node.get_attr("hidden"):
        toc_item.hidden = True

    epub_type = node.get_attr("epub:type")

    # It may be an empty header tag eg <h3>, so we pass its parent rather than itself to evaluate the parent's descendants
    if not epub_type and node.tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        parent = node.parent
        if not parent:
            # Shouldn't ever happen, but... just in case, raise an error
            raise se.InvalidInputException(f"Heading without parent in file: [path][link=file://{textf}]{textf}[/][/].")
        evaluate_descendants(parent, toc_item, textf)
        return toc_item

    if epub_type:
        # A heading may include z3998:roman directly,
        # eg <h5 epub:type="title z3998:roman">II</h5>.
        if "z3998:roman" in epub_type:
            toc_item.roman = extract_strings(node)
            try:
                roman.fromRoman(toc_item.roman)
            except roman.InvalidRomanNumeralError as err:
                raise se.InvalidInputException(f"Heading tagged as roman numeral is invalid: {toc_item.roman} in [path][link=file://{textf}]{textf}[/][/].") from err
            toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
            return toc_item

        # An ordinal, but not a roman numeral (eg in Nietzche's Beyond Good and Evil)
        if "ordinal" in epub_type:
            toc_item.title = extract_strings(node)
            toc_item.title_is_ordinal = True
            return toc_item

        # May be the halftitle page with a subtitle, so we need to burrow down
        if ("fulltitle" in epub_type) and (node.tag == "hgroup"):
            evaluate_descendants(node, toc_item, textf)
            return toc_item

        # Or it may be a straightforward one-level title eg: <h2 epub:type="title">Imprint</h2>
        if "title" in epub_type:
            toc_item.title = extract_strings(node)
            return toc_item

    # Otherwise, burrow down into its structure to get the info
    evaluate_descendants(node, toc_item, textf)

    return toc_item
def get_child_strings(node: EasyXmlElement) -> str:
    """
    Serializes every direct child element of a node.

    INPUTS:
    node: the element whose children we want

    OUTPUTS:
    the string form of each child element, each followed by a newline,
    concatenated in order (empty string if there are no child elements)
    """

    return "".join(child.to_string() + "\n" for child in node.xpath("./*"))
def evaluate_descendants(node: EasyXmlElement, toc_item: TocItem, textf: str) -> TocItem:
    """
    Burrow down into a hgroup structure to qualify the ToC item.
    The passed toc_item is filled in place and also returned.

    INPUTS:
    node: EasyXmlElement object representing a hgroup (or other parent of headings)
    toc_item: the ToC item to fill in from the node's <p>/<h#> children
    textf: the path to the file, used in error messages

    OUTPUTS:
    toc_item: qualified ToC item

    RAISES:
    se.InvalidInputException if a child tagged as a roman numeral isn't a valid one.
    """
    children = node.xpath("./p | ./h1 | ./h2 | ./h3 | ./h4 | ./h5 | ./h6")
    for child in children:  # we expect these to be h1, h2, h3, h4 etc
        epub_type = child.get_attr("epub:type")
        if child.get_attr("hidden"):
            toc_item.hidden = True
        if not epub_type:
            # should be a label/ordinal grouping
            child_strings = get_child_strings(child)
            if "label" in child_strings and "ordinal" in child_strings:  # quick test
                toc_item.title_is_ordinal = True
                # strip label
                child_strings = regex.sub(r"<span epub:type=\"label\">(.*?)</span>", " \\1 ", child_strings)
                # remove ordinal if it's by itself in a span
                child_strings = regex.sub(r"<span epub:type=\"ordinal\">(.*?)</span>", " \\1 ", child_strings)
                # remove ordinal if it's joined with a roman (which we want to keep)
                child_strings = regex.sub(r"\bordinal\b", "", child_strings)
                # remove extra spaces
                child_strings = regex.sub(r"[ ]{2,}", " ", child_strings)
                # remove any carriage returns
                child_strings = regex.sub(r"\n", "", child_strings)
                # get rid of any endnotes
                child_strings = strip_notes(child_strings)
                toc_item.title = child_strings.strip()
            # A child without an epub:type has nothing more to offer, and the
            # membership tests below would fail on a missing value.
            continue  # skip the following
        if "z3998:roman" in epub_type:
            toc_item.roman = extract_strings(child)
            try:
                roman.fromRoman(toc_item.roman)
            except roman.InvalidRomanNumeralError as err:
                raise se.InvalidInputException(f"Heading tagged as roman numeral is invalid: {toc_item.roman} in [path][link=file://{textf}]{textf}[/][/].") from err
            # An earlier child may already have supplied the title; don't overwrite it.
            if not toc_item.title:
                toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        elif "ordinal" in epub_type:  # but not a roman numeral or a labelled item, cases caught above
            # NOTE(review): the ordinal flag is set only when this child supplies the title — confirm against upstream nesting.
            if not toc_item.title:
                toc_item.title = extract_strings(child)
                toc_item.title_is_ordinal = True
        if "subtitle" in epub_type:
            toc_item.subtitle = extract_strings(child)
        else:
            if "title" in epub_type:  # this allows for `fulltitle` to work here, too
                if toc_item.title or toc_item.roman or toc_item.title_is_ordinal:  # if title already filled, must be a subtitle
                    toc_item.subtitle = extract_strings(child)
                    if toc_item.roman or toc_item.title_is_ordinal:  # in these cases, we want to check language on subtitle
                        toc_item.lang = child.get_attr("xml:lang")
                else:
                    toc_item.title = extract_strings(child)
                    if not toc_item.lang:
                        toc_item.lang = child.get_attr("xml:lang")
        if toc_item.title and toc_item.subtitle:  # then we're done, get out of loop by returning
            return toc_item
    return toc_item
def get_book_division(node: EasyXmlElement) -> BookDivision:
    """
    Determine the kind of book division. At present only Part, Division
    and Volume are important; but others stored for possible future logic.

    The decision is made from the `epub:type` of the last <section> or
    <article> ancestor returned by the xpath query (or of <body> if there
    is no such ancestor). Keywords are matched as substrings, and when
    several match, the one tested last wins.

    INPUTS:
    node: an EasyXml node representing a tag

    OUTPUTS:
    a BookDivision enum value representing the kind of division

    RAISES:
    se.InvalidInputException if the node has no section, article or body ancestor.
    """

    parent_sections = node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")

    if not parent_sections:
        parent_sections = node.xpath("./ancestor::body")

    if not parent_sections:  # couldn't find a parent, so throw an error
        raise se.InvalidInputException("Couldn’t locate a parent [xhtml]<section>[/], [xhtml]<article>[/], or [xhtml]<body>[/] for heading.")

    enclosing = parent_sections[-1]
    section_epub_type = enclosing.get_attr("epub:type")

    retval = BookDivision.NONE

    if not section_epub_type:
        return retval

    if "part" in section_epub_type:
        retval = BookDivision.PART

    if "division" in section_epub_type:
        retval = BookDivision.DIVISION

    if ("volume" in section_epub_type) and ("se:short-story" not in section_epub_type):
        retval = BookDivision.VOLUME

    if "chapter" in section_epub_type:
        retval = BookDivision.CHAPTER

    # Must be tested AFTER "chapter": "chapter" is a substring of "subchapter",
    # so testing in the other order would always overwrite SUBCHAPTER with CHAPTER.
    if "subchapter" in section_epub_type:
        retval = BookDivision.SUBCHAPTER

    if "article" in enclosing.tag:
        retval = BookDivision.ARTICLE

    return retval
def strip_notes(text: str) -> str:
    """
    Removes noteref links from a piece of html.

    INPUTS:
    text: html which may include noterefs

    OUTPUTS:
    the html with every <a> element carrying epub:type="noteref" removed,
    including the link's contents
    """

    noteref_pattern = r"""<a[^>]*?epub:type="noteref"[^>]*?>.*?<\/a>"""

    return regex.sub(noteref_pattern, "", text)
def process_all_content(self, file_list: list) -> Tuple[list, list]:
    """
    Analyze the whole content of the project, build and return lists
    of landmarks and toc_items.

    INPUTS:
    self: the object providing get_dom() (called with the SeEpub instance)
    file_list: a list of all content files

    OUTPUTS:
    a tuple containing the list of landmark items and the list of ToC items,
    in that order. The ToC list ends with a dummy item.

    RAISES:
    se.InvalidFileException if a file's dom can't be obtained.
    """

    toc_list: List[TocItem] = []
    landmarks: List[TocItem] = []

    # We make two passes through the work, because we need to know
    # how many bodymatter items there are. So we do landmarks first.
    for textf in file_list:
        try:
            dom = self.get_dom(textf)
        except Exception as ex:
            raise se.InvalidFileException(f"Couldn’t open file: [path][link=file://{textf}]{textf}[/][/]. Exception: {ex}") from ex

        add_landmark(dom, textf.name, landmarks)

    # Now we test to see if there is only one body item,
    # and if so, whether that item lacks a header
    body_items = [landmark for landmark in landmarks if landmark.place == Position.BODY]
    single_file = len(body_items) == 1
    single_file_without_headers = single_file and not body_items[0].has_headers

    # Second pass: collect the headings of every file.
    nest_under_halftitle = False
    for textf in file_list:
        with open(textf, "r", encoding="utf-8") as file:
            dom = se.easy_xml.EasyXmlTree(file.read())

        process_headings(dom, textf.name, toc_list, single_file, single_file_without_headers)

        # Only consider half title pages that are front matter. Some books, like C.S. Lewis Poetry, may have half titles that are bodymatter
        if dom.xpath("/html/body//*[contains(@epub:type, 'halftitlepage') and ancestor-or-self::*[contains(@epub:type, 'frontmatter')]]"):
            nest_under_halftitle = True

    # Now go through adjusting for nesting under halftitle
    if nest_under_halftitle:
        # Tricky because a few books have forewords, etc AFTER the halftitle, so have to know if we've passed it
        passed_halftitle = False
        for entry in toc_list:
            if entry.place == Position.BODY:
                entry.level += 1
            elif passed_halftitle and entry.place == Position.FRONT:
                entry.level += 1

            if "halftitle" in entry.file_link:
                passed_halftitle = True

    # We add this dummy item because the ToC output always needs to look ahead to the next item.
    sentinel = TocItem()
    sentinel.level = 1
    sentinel.title = "dummy"
    toc_list.append(sentinel)

    return landmarks, toc_list
def generate_toc(self) -> str:
    """
    Entry point for `SeEpub.generate_toc()`.

    Reads the work title, scans every file in the spine for landmarks
    and headings, and builds the new ToC from the existing ToC file.

    OUTPUTS:
    the new ToC as an xhtml string
    """

    title = self.get_work_title()
    landmark_items, toc_items = process_all_content(self, self.spine_file_paths)
    toc_xhtml = output_toc(toc_items, landmark_items, self.toc_path, title)

    return toc_xhtml