@@ -250,6 +250,33 @@ def handle_table(self):
250250 self .skip = True
251251 self .table = None
252252
def handle_link(self, href, text):
    """Append a hyperlink with visible ``text`` to the current paragraph.

    A ``w:hyperlink`` element is built by hand, pointed at a hyperlink
    relationship for ``href`` on the paragraph's part, and appended to
    ``self.paragraph``. The visible text lives in a run nested inside
    the hyperlink element.

    Args:
        href: Link target taken from the ``<a>`` tag.
        text: Text to display for the link.
    """
    # A hyperlink needs a relationship on the owning part. The target is
    # always a plain URI string here, never another package part, so the
    # relationship is always registered as external. Registering a string
    # target as an internal relationship (which previously happened for
    # any href not starting with "http", e.g. "mailto:", "ftp:", relative
    # paths or "#anchor") leaves a non-part target in the relationship
    # collection.
    # NOTE(review): in-document anchor links ("#name") are not mapped to
    # w:anchor bookmarks; they are stored as external targets like any
    # other href.
    rel_id = self.paragraph.part.relate_to(
        href,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    # Create the w:hyperlink element and bind it to the relationship.
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)

    # Create the run that carries the link text, tagged with the
    # 'Hyperlink' run style.
    subrun = self.paragraph.add_run()
    rPr = docx.oxml.shared.OxmlElement('w:rPr')
    rPr.style = 'Hyperlink'
    subrun._r.append(rPr)
    subrun._r.text = text

    # add_run() attached the run directly to the paragraph; appending its
    # element to the hyperlink re-parents it so the text sits inside the link.
    hyperlink.append(subrun._r)

    # Attach the finished hyperlink to the paragraph.
    self.paragraph._p.append(hyperlink)
279+
253280 def handle_starttag (self , tag , attrs ):
254281 if self .skip :
255282 return
@@ -276,10 +303,10 @@ def handle_starttag(self, tag, attrs):
276303 self .tags [tag ] = current_attrs
277304 if tag == 'p' :
278305 self .paragraph = self .doc .add_paragraph ()
279-
306+
280307 elif tag == 'li' :
281308 self .handle_li ()
282-
309+
283310 elif tag [0 ] == 'h' and len (tag ) == 2 :
284311 if isinstance (self .doc , docx .document .Document ):
285312 h_size = int (tag [1 ])
@@ -290,15 +317,15 @@ def handle_starttag(self, tag, attrs):
290317 elif tag == 'img' :
291318 self .handle_img (current_attrs )
292319 return
293-
320+
294321 elif tag == 'table' :
295322 self .handle_table ()
296323 return
297-
324+
298325 # set new run reference point in case of leading line breaks
299326 if tag == 'p' or tag == 'li' :
300327 self .run = self .paragraph .add_run ()
301-
328+
302329 # add style
303330 if not self .include_styles :
304331 return
@@ -310,27 +337,22 @@ def handle_endtag(self, tag):
310337 if self .skip :
311338 if not tag == self .skip_tag :
312339 return
313-
340+
314341 if self .instances_to_skip > 0 :
315342 self .instances_to_skip -= 1
316343 return
317344
318345 self .skip = False
319346 self .skip_tag = None
320347 self .paragraph = None
321-
348+
322349 if tag == 'span' :
323350 if self .tags ['span' ]:
324351 self .tags ['span' ].pop ()
325352 return
326353 elif tag == 'ol' or tag == 'ul' :
327354 remove_last_occurence (self .tags ['list' ], tag )
328355 return
329- elif tag == 'a' :
330- link = self .tags .pop (tag )
331- href = link ['href' ]
332- self .paragraph .add_run ('<link: %s>' % href )
333- return
334356 elif tag == 'table' :
335357 self .table_no += 1
336358 self .table = None
@@ -348,18 +370,27 @@ def handle_data(self, data):
348370 if not self .paragraph :
349371 self .paragraph = self .doc .add_paragraph ()
350372
351- self .run = self .paragraph .add_run (data )
352- spans = self .tags ['span' ]
353- for span in spans :
354- if 'style' in span :
355- style = self .parse_dict_string (span ['style' ])
356- self .add_styles_to_run (style )
357-
358- # add font style
359- for tag in self .tags :
360- if tag in fonts :
361- font_style = fonts [tag ]
362- setattr (self .run .font , font_style , True )
373+ # There can only be one nested link in a valid html document
374+ # You cannot have interactive content in an A tag, this includes links
375+ # https://html.spec.whatwg.org/#interactive-content
376+ link = self .tags .get ('a' )
377+ if link :
378+ self .handle_link (link ['href' ], data )
379+ else :
380+ # Not inside a link: put the data directly in a new run
381+ self .run = self .paragraph .add_run (data )
382+ spans = self .tags ['span' ]
383+ for span in spans :
384+ if 'style' in span :
385+ style = self .parse_dict_string (span ['style' ])
386+ self .add_styles_to_run (style )
387+
388+
389+ # add font style
390+ for tag in self .tags :
391+ if tag in fonts :
392+ font_style = fonts [tag ]
393+ setattr (self .run .font , font_style , True )
363394
364395 def ignore_nested_tables (self , tables_soup ):
365396 """
0 commit comments