diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
index 6a0e113..8348ebb 100644
--- a/htmldocx/h2d.py
+++ b/htmldocx/h2d.py
@@ -30,7 +30,7 @@
# values in inches
INDENT = 0.25
LIST_INDENT = 0.5
-MAX_INDENT = 5.5 # To stop indents going off the page
+MAX_INDENT = 5.5 # To stop indents going off the page
# Style to use with tables. By default no style is used.
DEFAULT_TABLE_STYLE = None
@@ -42,6 +42,7 @@
def get_filename_from_url(url):
return os.path.basename(urlparse(url).path)
+
def is_url(url):
"""
Not to be used for actually validating a url, but in our use case we only
@@ -50,6 +51,7 @@ def is_url(url):
parts = urlparse(url)
return all([parts.scheme, parts.netloc, parts.path])
+
def fetch_image(url):
"""
Attempts to fetch an image from a url.
@@ -64,9 +66,11 @@ def fetch_image(url):
except urllib.error.URLError:
return None
+
def remove_last_occurence(ls, x):
ls.pop(len(ls) - ls[::-1].index(x) - 1)
+
def remove_whitespace(string, leading=False, trailing=False):
"""Remove white space from a string.
@@ -132,12 +136,14 @@ def remove_whitespace(string, leading=False, trailing=False):
# TODO need some way to get rid of extra spaces in e.g. text text
return re.sub(r'\s+', ' ', string)
+
def delete_paragraph(paragraph):
# https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907
p = paragraph._element
p.getparent().remove(p)
p._p = p._element = None
+
font_styles = {
'b': 'bold',
'strong': 'bold',
@@ -160,8 +166,8 @@ def delete_paragraph(paragraph):
'LIST_NUMBER': 'List Number',
}
-class HtmlToDocx(HTMLParser):
+class HtmlToDocx(HTMLParser):
def __init__(self):
super().__init__()
self.options = {
@@ -171,9 +177,7 @@ def __init__(self):
'styles': True,
}
self.table_row_selectors = [
- 'table > tr',
- 'table > thead > tr',
- 'table > tbody > tr',
+ 'table > tr', 'table > thead > tr', 'table > tbody > tr',
'table > tfoot > tr'
]
self.table_style = DEFAULT_TABLE_STYLE
@@ -188,9 +192,10 @@ def set_initial_attrs(self, document=None):
self.doc = document
else:
self.doc = Document()
- self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
+ self.bs = self.options[
+ 'fix-html'] # whether or not to clean with BeautifulSoup
self.document = self.doc
- self.include_tables = True #TODO add this option back in?
+ self.include_tables = True #TODO add this option back in?
self.include_images = self.options['images']
self.include_styles = self.options['styles']
self.paragraph = None
@@ -223,7 +228,8 @@ def add_styles_to_paragraph(self, style):
units = re.sub(r'[0-9]+', '', margin)
margin = int(float(re.sub(r'[a-z]+', '', margin)))
if units == 'px':
- self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT))
+ self.paragraph.paragraph_format.left_indent = Inches(
+ min(margin // 10 * INDENT, MAX_INDENT))
# TODO handle non px units
def add_styles_to_run(self, style):
@@ -232,26 +238,27 @@ def add_styles_to_run(self, style):
color = re.sub(r'[a-z()]+', '', style['color'])
colors = [int(x) for x in color.split(',')]
elif '#' in style['color']:
- color = style['color'].lstrip('#')
- colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ color = style['color'].replace('\n', '').lstrip('#')
+ colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
self.run.font.color.rgb = RGBColor(*colors)
-
+
if 'background-color' in style:
if 'rgb' in style['background-color']:
- color = color = re.sub(r'[a-z()]+', '', style['background-color'])
+ color = color = re.sub(r'[a-z()]+', '',
+ style['background-color'])
colors = [int(x) for x in color.split(',')]
elif '#' in style['background-color']:
color = style['background-color'].lstrip('#')
- colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
+ colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4))
else:
colors = [0, 0, 0]
# TODO map colors to named colors (and extended colors...)
# For now set color to black to prevent crashing
- self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
+ self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors
def apply_paragraph_style(self, style=None):
try:
@@ -260,7 +267,8 @@ def apply_paragraph_style(self, style=None):
elif self.paragraph_style:
self.paragraph.style = self.paragraph_style
except KeyError as e:
- raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e
+ raise ValueError(
+ f"Unable to apply style {self.paragraph_style}.") from e
def parse_dict_string(self, string, separator=';'):
new_string = string.replace(" ", '').split(separator)
@@ -273,15 +281,16 @@ def handle_li(self):
if list_depth:
list_type = self.tags['list'][-1]
else:
- list_type = 'ul' # assign unordered if no tag
+ list_type = 'ul' # assign unordered if no tag
if list_type == 'ol':
list_style = styles['LIST_NUMBER']
else:
list_style = styles['LIST_BULLET']
- self.paragraph = self.doc.add_paragraph(style=list_style)
- self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT))
+ self.paragraph = self.doc.add_paragraph(style=list_style)
+ self.paragraph.paragraph_format.left_indent = Inches(
+ min(list_depth * LIST_INDENT, MAX_INDENT))
self.paragraph.paragraph_format.line_spacing = 1
def add_image_to_cell(self, cell, image):
@@ -319,7 +328,8 @@ def handle_img(self, current_attrs):
self.doc.add_paragraph("" % src)
else:
# avoid exposing filepaths in document
- self.doc.add_paragraph("" % get_filename_from_url(src))
+ self.doc.add_paragraph("" %
+ get_filename_from_url(src))
# add styles?
def handle_table(self):
@@ -331,14 +341,15 @@ def handle_table(self):
Tell HTMLParser to ignore any tags until the corresponding closing table tag
"""
table_soup = self.tables[self.table_no]
- rows, cols = self.get_table_dimensions(table_soup)
- self.table = self.doc.add_table(rows, cols)
+ rows_dim, cols_dim = self.get_table_dimensions(table_soup)
+ self.table = self.doc.add_table(rows_dim, cols_dim)
if self.table_style:
try:
self.table.style = self.table_style
except KeyError as e:
- raise ValueError(f"Unable to apply style {self.table_style}.") from e
+ raise ValueError(
+ f"Unable to apply style {self.table_style}.") from e
rows = self.get_table_rows(table_soup)
cell_row = 0
@@ -355,7 +366,7 @@ def handle_table(self):
child_parser.add_html_to_cell(cell_html, docx_cell)
cell_col += 1
cell_row += 1
-
+
# skip all tags until corresponding closing tag
self.instances_to_skip = len(table_soup.find_all('table'))
self.skip_tag = 'table'
@@ -375,7 +386,6 @@ def handle_link(self, href, text):
hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
-
# Create sub-run
subrun = self.paragraph.add_run()
rPr = docx.oxml.shared.OxmlElement('w:rPr')
@@ -417,7 +427,7 @@ def handle_starttag(self, tag, attrs):
return
elif tag == 'ol' or tag == 'ul':
self.tags['list'].append(tag)
- return # don't apply styles for now
+ return # don't apply styles for now
elif tag == 'br':
self.run.add_break()
return
@@ -438,15 +448,15 @@ def handle_starttag(self, tag, attrs):
self.paragraph = self.doc.add_paragraph()
pPr = self.paragraph._p.get_or_add_pPr()
pBdr = OxmlElement('w:pBdr')
- pPr.insert_element_before(pBdr,
- 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
- 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
- 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
- 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
+ pPr.insert_element_before(
+ pBdr, 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku',
+ 'w:wordWrap', 'w:overflowPunct', 'w:topLinePunct',
+ 'w:autoSpaceDE', 'w:autoSpaceDN', 'w:bidi', 'w:adjustRightInd',
+ 'w:snapToGrid', 'w:spacing', 'w:ind', 'w:contextualSpacing',
+ 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
- 'w:pPrChange'
- )
+ 'w:pPrChange')
bottom = OxmlElement('w:bottom')
bottom.set(qn('w:val'), 'single')
bottom.set(qn('w:sz'), '6')
@@ -527,7 +537,7 @@ def handle_data(self, data):
# You cannot have interactive content in an A tag, this includes links
# https://html.spec.whatwg.org/#interactive-content
link = self.tags.get('a')
- if link:
+ if link and 'href' in link.keys():
self.handle_link(link['href'], data)
else:
# If there's a link, dont put the data directly in the run
@@ -568,7 +578,8 @@ def ignore_nested_tables(self, tables_soup):
def get_table_rows(self, table_soup):
# If there's a header, body, footer or direct child tr tags, add row dimensions from there
- return table_soup.select(', '.join(self.table_row_selectors), recursive=False)
+ return table_soup.select(', '.join(self.table_row_selectors),
+ recursive=False)
def get_table_columns(self, row):
# Get all columns for the specified row tag.
@@ -579,16 +590,19 @@ def get_table_dimensions(self, table_soup):
rows = self.get_table_rows(table_soup)
# Table is either empty or has non-direct children between table and tr tags
# Thus the row dimensions and column dimensions are assumed to be 0
-
- cols = self.get_table_columns(rows[0]) if rows else []
- return len(rows), len(cols)
+ max_col_size = 0
+ for r in range(0, len(rows)):
+ col_size = len(self.get_table_columns(rows[r]))
+ if max_col_size < col_size:
+ max_col_size = col_size
+ return len(rows), max_col_size
def get_tables(self):
if not hasattr(self, 'soup'):
self.include_tables = False
return
# find other way to do it, or require this dependency?
- self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
self.table_no = 0
def run_process(self, html):
@@ -602,14 +616,18 @@ def run_process(self, html):
def add_html_to_document(self, html, document):
if not isinstance(html, str):
raise ValueError('First argument needs to be a %s' % str)
- elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.document.Document)
+ elif not isinstance(document,
+ docx.document.Document) and not isinstance(
+ document, docx.table._Cell):
+ raise ValueError('Second argument needs to be a %s' %
+ docx.document.Document)
self.set_initial_attrs(document)
self.run_process(html)
def add_html_to_cell(self, html, cell):
if not isinstance(cell, docx.table._Cell):
- raise ValueError('Second argument needs to be a %s' % docx.table._Cell)
+ raise ValueError('Second argument needs to be a %s' %
+ docx.table._Cell)
unwanted_paragraph = cell.paragraphs[0]
if unwanted_paragraph.text == "":
delete_paragraph(unwanted_paragraph)
@@ -618,7 +636,7 @@ def add_html_to_cell(self, html, cell):
# cells must end with a paragraph or will get message about corrupt file
# https://stackoverflow.com/a/29287121
if not self.doc.paragraphs:
- self.doc.add_paragraph('')
+ self.doc.add_paragraph('')
def parse_html_file(self, filename_html, filename_docx=None):
with open(filename_html, 'r') as infile:
@@ -629,23 +647,28 @@ def parse_html_file(self, filename_html, filename_docx=None):
path, filename = os.path.split(filename_html)
filename_docx = '%s/new_docx_file_%s' % (path, filename)
self.doc.save('%s.docx' % filename_docx)
-
+
def parse_html_string(self, html):
self.set_initial_attrs()
self.run_process(html)
return self.doc
-if __name__=='__main__':
-
- arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting')
- arg_parser.add_argument('filename_html', help='The .html file to be parsed')
+
+if __name__ == '__main__':
+
+ arg_parser = argparse.ArgumentParser(
+ description='Convert .html file into .docx file with formatting')
+ arg_parser.add_argument('filename_html',
+ help='The .html file to be parsed')
+ arg_parser.add_argument(
+ 'filename_docx',
+ nargs='?',
+ help=
+ 'The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
+ default=None)
arg_parser.add_argument(
- 'filename_docx',
- nargs='?',
- help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
- default=None
- )
- arg_parser.add_argument('--bs', action='store_true',
+ '--bs',
+ action='store_true',
help='Attempt to fix html before parsing. Requires bs4. Default True')
args = vars(arg_parser.parse_args())