Skip to content

Commit a460c32

Browse files
author
Ava Thorn
committed
fixed hyperlink bug
1 parent c699ed6 commit a460c32

File tree

1 file changed

+55
-24
lines changed

1 file changed

+55
-24
lines changed

htmldocx/h2d.py

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,33 @@ def handle_table(self):
250250
self.skip = True
251251
self.table = None
252252

253+
def handle_link(self, href, text):
    """Append a hyperlink showing ``text`` to the current paragraph.

    Args:
        href: Target of the link as found in the HTML ``href`` attribute.
            A value starting with ``#`` is written as an in-document
            anchor; any other non-empty value is written as an external
            relationship target.  An empty or missing value produces a
            plain, unlinked run so the text is not lost.
        text: Text to display for the link.
    """
    if not href:
        # Nothing to point at (e.g. <a href=""> or a valueless href):
        # keep the text but do not emit a broken hyperlink element.
        self.paragraph.add_run(text)
        return

    # Create the w:hyperlink tag and add needed values
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    if href.startswith('#'):
        # In-document fragment: referenced through the w:anchor attribute,
        # which does not need a package relationship.
        hyperlink.set(docx.oxml.shared.qn('w:anchor'), href[1:])
    else:
        # A string target is only valid for an *external* relationship;
        # an internal one expects a package Part object.  Deciding this by
        # an 'http' prefix wrongly sent mailto:, ftp: and relative URLs
        # down the internal path, so always register the URL as external.
        rel_id = self.paragraph.part.relate_to(
            href,
            docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
            is_external=True,
        )
        hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)

    # Create sub-run
    subrun = self.paragraph.add_run()
    rPr = docx.oxml.shared.OxmlElement('w:rPr')
    # NOTE(review): this references a character style with id 'Hyperlink';
    # presumably the link renders unstyled if the document does not define
    # it -- confirm against the template in use.
    rPr.style = 'Hyperlink'
    subrun._r.append(rPr)
    subrun._r.text = text

    # Move the run out of the paragraph and into the hyperlink element
    # (appending an element that already has a parent re-parents it).
    hyperlink.append(subrun._r)

    # Add hyperlink to paragraph
    self.paragraph._p.append(hyperlink)
279+
253280
def handle_starttag(self, tag, attrs):
254281
if self.skip:
255282
return
@@ -276,10 +303,10 @@ def handle_starttag(self, tag, attrs):
276303
self.tags[tag] = current_attrs
277304
if tag == 'p':
278305
self.paragraph = self.doc.add_paragraph()
279-
306+
280307
elif tag == 'li':
281308
self.handle_li()
282-
309+
283310
elif tag[0] == 'h' and len(tag) == 2:
284311
if isinstance(self.doc, docx.document.Document):
285312
h_size = int(tag[1])
@@ -290,15 +317,15 @@ def handle_starttag(self, tag, attrs):
290317
elif tag == 'img':
291318
self.handle_img(current_attrs)
292319
return
293-
320+
294321
elif tag == 'table':
295322
self.handle_table()
296323
return
297-
324+
298325
# set new run reference point in case of leading line breaks
299326
if tag == 'p' or tag == 'li':
300327
self.run = self.paragraph.add_run()
301-
328+
302329
# add style
303330
if not self.include_styles:
304331
return
@@ -310,27 +337,22 @@ def handle_endtag(self, tag):
310337
if self.skip:
311338
if not tag == self.skip_tag:
312339
return
313-
340+
314341
if self.instances_to_skip > 0:
315342
self.instances_to_skip -= 1
316343
return
317344

318345
self.skip = False
319346
self.skip_tag = None
320347
self.paragraph = None
321-
348+
322349
if tag == 'span':
323350
if self.tags['span']:
324351
self.tags['span'].pop()
325352
return
326353
elif tag == 'ol' or tag == 'ul':
327354
remove_last_occurence(self.tags['list'], tag)
328355
return
329-
elif tag == 'a':
330-
link = self.tags.pop(tag)
331-
href = link['href']
332-
self.paragraph.add_run('<link: %s>' % href)
333-
return
334356
elif tag == 'table':
335357
self.table_no += 1
336358
self.table = None
@@ -348,18 +370,27 @@ def handle_data(self, data):
348370
if not self.paragraph:
349371
self.paragraph = self.doc.add_paragraph()
350372

351-
self.run = self.paragraph.add_run(data)
352-
spans = self.tags['span']
353-
for span in spans:
354-
if 'style' in span:
355-
style = self.parse_dict_string(span['style'])
356-
self.add_styles_to_run(style)
357-
358-
# add font style
359-
for tag in self.tags:
360-
if tag in fonts:
361-
font_style = fonts[tag]
362-
setattr(self.run.font, font_style, True)
373+
# There can only be one nested link in a valid html document
374+
# You cannot have interactive content in an A tag, this includes links
375+
# https://html.spec.whatwg.org/#interactive-content
376+
link = self.tags.get('a')
377+
if link:
378+
self.handle_link(link['href'], data)
379+
else:
380+
# No enclosing link: put the data directly in a new run
381+
self.run = self.paragraph.add_run(data)
382+
spans = self.tags['span']
383+
for span in spans:
384+
if 'style' in span:
385+
style = self.parse_dict_string(span['style'])
386+
self.add_styles_to_run(style)
387+
388+
389+
# add font style
390+
for tag in self.tags:
391+
if tag in fonts:
392+
font_style = fonts[tag]
393+
setattr(self.run.font, font_style, True)
363394

364395
def ignore_nested_tables(self, tables_soup):
365396
"""

0 commit comments

Comments
 (0)