From a7218c316388f47f7a6126d087788fdcbe664224 Mon Sep 17 00:00:00 2001
From: rogelio-basurto <77854803+rogelio-basurto@users.noreply.github.com>
Date: Thu, 9 Dec 2021 06:02:19 -0600
Subject: [PATCH] Update h2d.py
Used a formatter: yapf
Also fixed a bug in link handling: text inside an <a> tag that has no 'href' attribute caused problems, so it is now only treated as a link when 'href' is present.
Also changed table creation so that the column count is the maximum found across all rows rather than that of the first row, which matters when the table has merged cells.
Also strip stray newline characters from hex color values before parsing them (I encountered an HTML file with \n scattered all over).
---
htmldocx/h2d.py | 129 ++++++++++++++++++++++++++++--------------------
1 file changed, 76 insertions(+), 53 deletions(-)
diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
index 6a0e113..8348ebb 100644
--- a/htmldocx/h2d.py
+++ b/htmldocx/h2d.py
@@ -30,7 +30,7 @@
# values in inches
INDENT = 0.25
LIST_INDENT = 0.5
-MAX_INDENT = 5.5 # To stop indents going off the page
+MAX_INDENT = 5.5 # To stop indents going off the page
# Style to use with tables. By default no style is used.
DEFAULT_TABLE_STYLE = None
@@ -42,6 +42,7 @@
def get_filename_from_url(url):
return os.path.basename(urlparse(url).path)
+
def is_url(url):
"""
Not to be used for actually validating a url, but in our use case we only
@@ -50,6 +51,7 @@ def is_url(url):
parts = urlparse(url)
return all([parts.scheme, parts.netloc, parts.path])
+
def fetch_image(url):
"""
Attempts to fetch an image from a url.
@@ -64,9 +66,11 @@ def fetch_image(url):
except urllib.error.URLError:
return None
+
def remove_last_occurence(ls, x):
ls.pop(len(ls) - ls[::-1].index(x) - 1)
+
def remove_whitespace(string, leading=False, trailing=False):
"""Remove white space from a string.
@@ -132,12 +136,14 @@ def remove_whitespace(string, leading=False, trailing=False):
# TODO need some way to get rid of extra spaces in e.g. text text
return re.sub(r'\s+', ' ', string)
+
def delete_paragraph(paragraph):
# https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
p = paragraph._element
p.getparent().remove(p)
p._p = p._element = None
+
font_styles = {
'b': 'bold',
'strong': 'bold',
@@ -160,8 +166,8 @@ def delete_paragraph(paragraph):
'LIST_NUMBER': 'List Number',
}
-class HtmlToDocx(HTMLParser):
+class HtmlToDocx(HTMLParser):
def __init__(self):
super().__init__()
self.options = {
@@ -171,9 +177,7 @@ def __init__(self):
'styles': True,
}
self.table_row_selectors = [
- 'table > tr',
- 'table > thead > tr',
- 'table > tbody > tr',
+ 'table > tr', 'table > thead > tr', 'table > tbody > tr',
'table > tfoot > tr'
]
self.table_style = DEFAULT_TABLE_STYLE
@@ -188,9 +192,10 @@ def set_initial_attrs(self, document=None):
self.doc = document
else:
self.doc = Document()
- self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
+ self.bs = self.options[
+ 'fix-html'] # whether or not to clean with BeautifulSoup
self.document = self.doc
- self.include_tables = True #TODO add this option back in?
+ self.include_tables = True #TODO add this option back in?
self.include_images = self.options['images']
self.include_styles = self.options['styles']
self.paragraph = None
@@ -223,7 +228,8 @@ def add_styles_to_paragraph(self, style):
units = re.sub(r'[0-9]+', '', margin)
margin = int(float(re.sub(r'[a-z]+', '', margin)))
if units == 'px':
- self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT))
+ self.paragraph.paragraph_format.left_indent = Inches(
+ min(margin // 10 * INDENT, MAX_INDENT))
# TODO handle non px units
def add_styles_to_run(self, style):
@@ -232,26 +238,27 @@ def add_styles_to_run(self, style):
color = re.sub(r'[a-z()]+', '', style['color'])
colors = [int(x) for x in color.split(',')]
elif '#' in style['color']:
- color = style['color'].lstrip('#')
- colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ color = style['color'].replace('\n', '').lstrip('#')
+ colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
self.run.font.color.rgb = RGBColor(*colors)
-
+
if 'background-color' in style:
if 'rgb' in style['background-color']:
- color = color = re.sub(r'[a-z()]+', '', style['background-color'])
+ color = color = re.sub(r'[a-z()]+', '',
+ style['background-color'])
colors = [int(x) for x in color.split(',')]
elif '#' in style['background-color']:
color = style['background-color'].lstrip('#')
- colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
- self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
+ self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
def apply_paragraph_style(self, style=None):
try:
@@ -260,7 +267,8 @@ def apply_paragraph_style(self, style=None):
elif self.paragraph_style:
self.paragraph.style = self.paragraph_style
except KeyError as e:
- raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e
+ raise ValueError(
+ f"Unable to apply style {self.paragraph_style}.") from e
def parse_dict_string(self, string, separator=';'):
new_string = string.replace(" ", '').split(separator)
@@ -273,15 +281,16 @@ def handle_li(self):
if list_depth:
list_type = self.tags['list'][-1]
else:
- list_type = 'ul' # assign unordered if no tag
+ list_type = 'ul' # assign unordered if no tag
if list_type == 'ol':
list_style = styles['LIST_NUMBER']
else:
list_style = styles['LIST_BULLET']
- self.paragraph = self.doc.add_paragraph(style=list_style)
- self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT))
+ self.paragraph = self.doc.add_paragraph(style=list_style)
+ self.paragraph.paragraph_format.left_indent = Inches(
+ min(list_depth * LIST_INDENT, MAX_INDENT))
self.paragraph.paragraph_format.line_spacing = 1
def add_image_to_cell(self, cell, image):
@@ -319,7 +328,8 @@ def handle_img(self, current_attrs):
self.doc.add_paragraph("" % src)
else:
# avoid exposing filepaths in document
- self.doc.add_paragraph("" % get_filename_from_url(src))
+ self.doc.add_paragraph("" %
+ get_filename_from_url(src))
# add styles?
def handle_table(self):
@@ -331,14 +341,15 @@ def handle_table(self):
Tell HTMLParser to ignore any tags until the corresponding closing table tag
"""
table_soup = self.tables[self.table_no]
- rows, cols = self.get_table_dimensions(table_soup)
- self.table = self.doc.add_table(rows, cols)
+ rows_dim, cols_dim = self.get_table_dimensions(table_soup)
+ self.table = self.doc.add_table(rows_dim, cols_dim)
if self.table_style:
try:
self.table.style = self.table_style
except KeyError as e:
- raise ValueError(f"Unable to apply style {self.table_style}.") from e
+ raise ValueError(
+ f"Unable to apply style {self.table_style}.") from e
rows = self.get_table_rows(table_soup)
cell_row = 0
@@ -355,7 +366,7 @@ def handle_table(self):
child_parser.add_html_to_cell(cell_html, docx_cell)
cell_col += 1
cell_row += 1
-
+
# skip all tags until corresponding closing tag
self.instances_to_skip = len(table_soup.find_all('table'))
self.skip_tag = 'table'
@@ -375,7 +386,6 @@ def handle_link(self, href, text):
hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
-
# Create sub-run
subrun = self.paragraph.add_run()
rPr = docx.oxml.shared.OxmlElement('w:rPr')
@@ -417,7 +427,7 @@ def handle_starttag(self, tag, attrs):
return
elif tag == 'ol' or tag == 'ul':
self.tags['list'].append(tag)
- return # don't apply styles for now
+ return # don't apply styles for now
elif tag == 'br':
self.run.add_break()
return
@@ -438,15 +448,15 @@ def handle_starttag(self, tag, attrs):
self.paragraph = self.doc.add_paragraph()
pPr = self.paragraph._p.get_or_add_pPr()
pBdr = OxmlElement('w:pBdr')
- pPr.insert_element_before(pBdr,
- 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
- 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
- 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
- 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
+ pPr.insert_element_before(
+ pBdr, 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku',
+ 'w:wordWrap', 'w:overflowPunct', 'w:topLinePunct',
+ 'w:autoSpaceDE', 'w:autoSpaceDN', 'w:bidi', 'w:adjustRightInd',
+ 'w:snapToGrid', 'w:spacing', 'w:ind', 'w:contextualSpacing',
+ 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
- 'w:pPrChange'
- )
+ 'w:pPrChange')
bottom = OxmlElement('w:bottom')
bottom.set(qn('w:val'), 'single')
bottom.set(qn('w:sz'), '6')
@@ -527,7 +537,7 @@ def handle_data(self, data):
# You cannot have interactive content in an A tag, this includes links
# https://html.spec.whatwg.org/#interactive-content
link = self.tags.get('a')
- if link:
+ if link and 'href' in link.keys():
self.handle_link(link['href'], data)
else:
# If there's a link, dont put the data directly in the run
@@ -568,7 +578,8 @@ def ignore_nested_tables(self, tables_soup):
def get_table_rows(self, table_soup):
# If there's a header, body, footer or direct child tr tags, add row dimensions from there
- return table_soup.select(', '.join(self.table_row_selectors), recursive=False)
+ return table_soup.select(', '.join(self.table_row_selectors),
+ recursive=False)
def get_table_columns(self, row):
# Get all columns for the specified row tag.
@@ -579,16 +590,19 @@ def get_table_dimensions(self, table_soup):
rows = self.get_table_rows(table_soup)
# Table is either empty or has non-direct children between table and tr tags
# Thus the row dimensions and column dimensions are assumed to be 0
-
- cols = self.get_table_columns(rows[0]) if rows else []
- return len(rows), len(cols)
+ max_col_size = 0
+ for r in range(0, len(rows)):
+ col_size = len(self.get_table_columns(rows[r]))
+ if max_col_size < col_size:
+ max_col_size = col_size
+ return len(rows), max_col_size
def get_tables(self):
if not hasattr(self, 'soup'):
self.include_tables = False
return
# find other way to do it, or require this dependency?
- self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
self.table_no = 0
def run_process(self, html):
@@ -602,14 +616,18 @@ def run_process(self, html):
def add_html_to_document(self, html, document):
if not isinstance(html, str):
raise ValueError('First argument needs to be a %s' % str)
- elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.document.Document)
+ elif not isinstance(document,
+ docx.document.Document) and not isinstance(
+ document, docx.table._Cell):
+ raise ValueError('Second argument needs to be a %s' %
+ docx.document.Document)
self.set_initial_attrs(document)
self.run_process(html)
def add_html_to_cell(self, html, cell):
if not isinstance(cell, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.table._Cell)
+ raise ValueError('Second argument needs to be a %s' %
+ docx.table._Cell)
unwanted_paragraph = cell.paragraphs[0]
if unwanted_paragraph.text == "":
delete_paragraph(unwanted_paragraph)
@@ -618,7 +636,7 @@ def add_html_to_cell(self, html, cell):
# cells must end with a paragraph or will get message about corrupt file
# https://stackoverflow.com/a/29287121
if not self.doc.paragraphs:
- self.doc.add_paragraph('')
+ self.doc.add_paragraph('')
def parse_html_file(self, filename_html, filename_docx=None):
with open(filename_html, 'r') as infile:
@@ -629,23 +647,28 @@ def parse_html_file(self, filename_html, filename_docx=None):
path, filename = os.path.split(filename_html)
filename_docx = '%s/new_docx_file_%s' % (path, filename)
self.doc.save('%s.docx' % filename_docx)
-
+
def parse_html_string(self, html):
self.set_initial_attrs()
self.run_process(html)
return self.doc
-if __name__=='__main__':
-
- arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting')
- arg_parser.add_argument('filename_html', help='The .html file to be parsed')
+
+if __name__ == '__main__':
+
+ arg_parser = argparse.ArgumentParser(
+ description='Convert .html file into .docx file with formatting')
+ arg_parser.add_argument('filename_html',
+ help='The .html file to be parsed')
+ arg_parser.add_argument(
+ 'filename_docx',
+ nargs='?',
+ help=
+ 'The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
+ default=None)
arg_parser.add_argument(
- 'filename_docx',
- nargs='?',
- help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
- default=None
- )
- arg_parser.add_argument('--bs', action='store_true',
+ '--bs',
+ action='store_true',
help='Attempt to fix html before parsing. Requires bs4. Default True')
args = vars(arg_parser.parse_args())