Skip to content

Commit

Permalink
travail 08/13 11:54
Browse files Browse the repository at this point in the history
  • Loading branch information
deborah-powers committed Aug 13, 2024
1 parent 4de518d commit 31985bd
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 29 deletions.
2 changes: 0 additions & 2 deletions fanfic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,9 @@ def __init__ (self, url, subject=None):
elif '</article>' in self.text and self.text.count ('</article>') ==1: self.text = htmlCls.getByTag (self.text, 'article', False)
self.meta ={ 'link': self.link, 'author': self.author, 'autlink': self.autlink, 'subject': self.subject }
self.delId()
"""
article = self.toText()
if article: article.divide()
else: self.divide()
"""

def findSubject (self):
if self.subject:
Expand Down
58 changes: 31 additions & 27 deletions htmlCls.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ def delId (self):
for child in self.children: child.delId()
self.toInnerHtml()

def delScript (self):
    """Delete every 'script' node, then every 'style' node, found from self.

    For each tag name in turn, the matches returned by getAllByTag are handed
    one by one to delete. The 'style' lookup only happens once all the
    'script' nodes have been deleted.
    """
    for tagName in ('script', 'style'):
        for node in self.getAllByTag (tagName):
            self.delete (node)

def delete (self, childToDel):
if len (self.children) ==0: return None
elif childToDel in self.children:
Expand All @@ -62,8 +68,6 @@ def delete (self, childToDel):
res = self.children[c].delete (childToDel)
c+=1
return res


# ________________________ récupérer les noeuds d'intérêt ________________________

def getOneByTag (self, tagName):
Expand Down Expand Up @@ -209,7 +213,7 @@ def setChildren (self):
if f<0: f= self.lenght()
self.children.append (HtmlTag (self.innerHtml[d:f]))
else: d= self.innerHtml.find ('<', d+1)
# éliminer les emboîtements inutiles
# éliminer les emboîtements inutiles
if len (self.children) ==1:
if self.children[0].tag == 'text': self.children =[]
elif self.tag == 'a':
Expand All @@ -231,7 +235,6 @@ def unnestOneChild (self):
for child in self.children[0].children: self.children.append (child)
self.children.pop (0)


# ________________________ manipulations basiques ________________________

def lenght (self):
Expand All @@ -253,6 +256,7 @@ def __lt__ (self, other):
else: return False

def toInnerHtml (self):
    """Rebuild self.innerHtml from the string form of each child, in order.

    Returns None. When self.children is empty nothing is written, so the
    existing innerHtml is left untouched.
    """
    if not self.children: return
    # One join instead of re-assigning the attribute once per child: the
    # repeated "attribute = attribute + piece" form copies the growing string
    # on every iteration.
    self.innerHtml = "".join (str (child) for child in self.children)

Expand Down Expand Up @@ -347,6 +351,7 @@ def setFromTag (self, node):
if node != None:
self.tree = node
self.text = node.innerHtml
self.tree.tag = 'body'

# ________________________ finir la lecture, préparer l'écriture ________________________

Expand All @@ -356,20 +361,18 @@ def setHtml (self):
self.tree = HtmlTag (self.text[d:f])
self.text = self.tree.innerHtml

def set (self):
    """Build self.tree from the body element contained in self.text.

    The slice going from the first '<body' to the end of the last '</body>'
    is passed to HtmlTag. The resulting object is stored in self.tree and its
    innerHtml replaces self.text.
    """
    closing = '</body>'
    bodyStart = self.text.find ('<body')
    # rfind gives the start of the last closing tag; add its length to include it
    bodyEnd = len (closing) + self.text.rfind (closing)
    bodySource = self.text[bodyStart:bodyEnd]
    self.tree = HtmlTag (bodySource)
    self.text = self.tree.innerHtml

def setTitle (self):
    """Fill self.title from the title tag or, failing that, from the h1 tag.

    '</title>' is looked for in self.text first, then '</h1>'. For the first
    one present, the innerHtml of the node returned by getOneByTag is passed
    through cleanTitle and stored. self.title is not touched when neither
    closing tag is present.
    """
    for tagName in ('title', 'h1'):
        if '</' + tagName + '>' not in self.text: continue
        node = self.getOneByTag (tagName)
        self.title = cleanTitle (node.innerHtml)
        break

def setMain (self):
    """Call setByTag for 'main' and/or 'article' depending on self.text.

    setByTag ('main') is called when self.text contains '</main>'. After that,
    setByTag ('article') is called when self.text holds exactly one
    '</article>'. The two checks are independent of each other.
    """
    hasMain = '</main>' in self.text
    if hasMain: self.setByTag ('main')
    # counted only now: self.text is read again after the call above
    nbArticles = self.text.count ('</article>')
    if nbArticles == 1: self.setByTag ('article')
    # the equivalent step for a single '</section>' is currently disabled

def setMetas (self):
metaList = self.tree.getAllByTag ('meta')
self.meta ={}
# log.logMsg (metaList)
for meta in metaList:
attributes = meta.attributes.keys()
if 'content' in attributes and 'name' in attributes and meta.attributes['name'][:5] != 'csrf-':
Expand Down Expand Up @@ -402,21 +405,25 @@ def toText (self):
def read (self):
# Load the content through File.read, then run the post-processing calls below in this order.
# NOTE(review): this listing comes from a commit diff whose +/- markers are missing, so lines
# removed and lines added by the commit may both appear here. The calls that follow cleanBody()
# (setHtml, setTitle, setMetas) are also listed at the end of the cleanBody hunk further down;
# confirm against the real file which of them still belong to this method.
File.read (self)
self.cleanBody()
self.setHtml()
self.setTitle()
self.setMetas()
# NOTE(review): setFromTag receives the string 'body' here, whereas the cleanBody hunk uses
# setByTag ('body') - verify which of the two is intended.
self.setFromTag ('body')
# the tag of the resulting tree is set to 'body' explicitly
self.tree.tag = 'body'
# self.delAttributes()

def addIndentation (self):
self.replace ('\n'," ")
self.replace ('\t'," ")
while " " in self.text: self.replace (" "," ")
self.replace ("> ",'>')
self.replace (" <",'<')
# rajouter les espaces autour des balises internes
self.replace ("<a ", " <a ")
self.replace ("> <a ", "><a ")
for tag in listTagsIntern[:-1]:
self.replace ('<'+ tag, ' <'+ tag)
self.replace ('> <'+ tag, '><'+ tag)
tagPoint = '<.:;'
for tag in listTagsIntern:
self.replace ('</'+ tag +'>', '</'+ tag +'> ')
for point in tagPoint: self.replace ('</'+ tag +'> '+ point, '</'+ tag +'>'+ point)
# rajouter les sauts de ligne
self.replace ('><', '>\n<')
# self.replace ('>\n</', '></')
for tag in listTagsIntern:
self.replace ('\n<' + tag, '<'+ tag)
self.replace ('</' + tag + '>\n', '</' + tag +'>')
Expand All @@ -425,7 +432,6 @@ def write (self, mode='w'):
# self.text ne contient plus que le corps du body
self.addIndentation()
self.meta['link'] = self.link
self.cleanBody()
self.title = cleanTitle (self.title)
self.text = templateHtml % (self.title, self.getMetas(), self.text)
File.write (self, mode)
Expand Down Expand Up @@ -471,13 +477,7 @@ def fromUrl (self, params=None):
res = self.fromUrlVa()
if not res: res = self.fromUrlVb()
self.path = pathTmp
if res:
self.cleanBody()
self.setTitle()
self.setMetas()
self.setFromTag ('body')
self.tree.tag = 'body'
# self.delAttributes()
if res: self.cleanBody()
else: print ('la récupération à échoué, impossible de récupérer les données')

""" ________________________ nettoyer le texte ________________________ """
Expand Down Expand Up @@ -513,4 +513,8 @@ def cleanBody (self):
for tag in listTagsSelfClosing:
self.replace ('<'+ tag.upper(), '<'+ tag)
self.replace (tag +'>', tag +'/>')
self.delScript()
self.setHtml()
self.setTitle()
self.setMetas()
self.setByTag ('body')
# self.delScript()

0 comments on commit 31985bd

Please sign in to comment.