From a7218c316388f47f7a6126d087788fdcbe664224 Mon Sep 17 00:00:00 2001 From: rogelio-basurto <77854803+rogelio-basurto@users.noreply.github.com> Date: Thu, 9 Dec 2021 06:02:19 -0600 Subject: [PATCH] Update h2d.py Reformatted the file with a formatter: yapf. Also fixed a bug in the links section: if a link did not have an 'href', it caused problems. Also changed the table creation so that it checks the maximum number of columns across all rows, for tables that have merged elements. Added a fix for color values in case they contain stray characters (I got an HTML file with \n all over). --- htmldocx/h2d.py | 129 ++++++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 53 deletions(-) diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..8348ebb 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -30,7 +30,7 @@ # values in inches INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. DEFAULT_TABLE_STYLE = None @@ -42,6 +42,7 @@ def get_filename_from_url(url): return os.path.basename(urlparse(url).path) + def is_url(url): """ Not to be used for actually validating a url, but in our use case we only @@ -50,6 +51,7 @@ def is_url(url): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def fetch_image(url): """ Attempts to fetch an image from a url. @@ -64,9 +66,11 @@ def fetch_image(url): except urllib.error.URLError: return None + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -132,12 +136,14 @@ def remove_whitespace(string, leading=False, trailing=False): # TODO need some way to get rid of extra spaces in e.g.
text text return re.sub(r'\s+', ' ', string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 p = paragraph._element p.getparent().remove(p) p._p = p._element = None + font_styles = { 'b': 'bold', 'strong': 'bold', @@ -160,8 +166,8 @@ def delete_paragraph(paragraph): 'LIST_NUMBER': 'List Number', } -class HtmlToDocx(HTMLParser): +class HtmlToDocx(HTMLParser): def __init__(self): super().__init__() self.options = { @@ -171,9 +177,7 @@ def __init__(self): 'styles': True, } self.table_row_selectors = [ - 'table > tr', - 'table > thead > tr', - 'table > tbody > tr', + 'table > tr', 'table > thead > tr', 'table > tbody > tr', 'table > tfoot > tr' ] self.table_style = DEFAULT_TABLE_STYLE @@ -188,9 +192,10 @@ def set_initial_attrs(self, document=None): self.doc = document else: self.doc = Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options[ + 'fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc - self.include_tables = True #TODO add this option back in? + self.include_tables = True #TODO add this option back in? 
self.include_images = self.options['images'] self.include_styles = self.options['styles'] self.paragraph = None @@ -223,7 +228,8 @@ def add_styles_to_paragraph(self, style): units = re.sub(r'[0-9]+', '', margin) margin = int(float(re.sub(r'[a-z]+', '', margin))) if units == 'px': - self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + self.paragraph.paragraph_format.left_indent = Inches( + min(margin // 10 * INDENT, MAX_INDENT)) # TODO handle non px units def add_styles_to_run(self, style): @@ -232,26 +238,27 @@ def add_styles_to_run(self, style): color = re.sub(r'[a-z()]+', '', style['color']) colors = [int(x) for x in color.split(',')] elif '#' in style['color']: - color = style['color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + color = style['color'].replace('\n', '').lstrip('#') + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing self.run.font.color.rgb = RGBColor(*colors) - + if 'background-color' in style: if 'rgb' in style['background-color']: - color = color = re.sub(r'[a-z()]+', '', style['background-color']) + color = color = re.sub(r'[a-z()]+', '', + style['background-color']) colors = [int(x) for x in color.split(',')] elif '#' in style['background-color']: color = style['background-color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) 
# For now set color to black to prevent crashing - self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors def apply_paragraph_style(self, style=None): try: @@ -260,7 +267,8 @@ def apply_paragraph_style(self, style=None): elif self.paragraph_style: self.paragraph.style = self.paragraph_style except KeyError as e: - raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e + raise ValueError( + f"Unable to apply style {self.paragraph_style}.") from e def parse_dict_string(self, string, separator=';'): new_string = string.replace(" ", '').split(separator) @@ -273,15 +281,16 @@ def handle_li(self): if list_depth: list_type = self.tags['list'][-1] else: - list_type = 'ul' # assign unordered if no tag + list_type = 'ul' # assign unordered if no tag if list_type == 'ol': list_style = styles['LIST_NUMBER'] else: list_style = styles['LIST_BULLET'] - self.paragraph = self.doc.add_paragraph(style=list_style) - self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) + self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph.paragraph_format.left_indent = Inches( + min(list_depth * LIST_INDENT, MAX_INDENT)) self.paragraph.paragraph_format.line_spacing = 1 def add_image_to_cell(self, cell, image): @@ -319,7 +328,8 @@ def handle_img(self, current_attrs): self.doc.add_paragraph("" % src) else: # avoid exposing filepaths in document - self.doc.add_paragraph("" % get_filename_from_url(src)) + self.doc.add_paragraph("" % + get_filename_from_url(src)) # add styles? 
def handle_table(self): @@ -331,14 +341,15 @@ def handle_table(self): Tell HTMLParser to ignore any tags until the corresponding closing table tag """ table_soup = self.tables[self.table_no] - rows, cols = self.get_table_dimensions(table_soup) - self.table = self.doc.add_table(rows, cols) + rows_dim, cols_dim = self.get_table_dimensions(table_soup) + self.table = self.doc.add_table(rows_dim, cols_dim) if self.table_style: try: self.table.style = self.table_style except KeyError as e: - raise ValueError(f"Unable to apply style {self.table_style}.") from e + raise ValueError( + f"Unable to apply style {self.table_style}.") from e rows = self.get_table_rows(table_soup) cell_row = 0 @@ -355,7 +366,7 @@ def handle_table(self): child_parser.add_html_to_cell(cell_html, docx_cell) cell_col += 1 cell_row += 1 - + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -375,7 +386,6 @@ def handle_link(self, href, text): hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) - # Create sub-run subrun = self.paragraph.add_run() rPr = docx.oxml.shared.OxmlElement('w:rPr') @@ -417,7 +427,7 @@ def handle_starttag(self, tag, attrs): return elif tag == 'ol' or tag == 'ul': self.tags['list'].append(tag) - return # don't apply styles for now + return # don't apply styles for now elif tag == 'br': self.run.add_break() return @@ -438,15 +448,15 @@ def handle_starttag(self, tag, attrs): self.paragraph = self.doc.add_paragraph() pPr = self.paragraph._p.get_or_add_pPr() pBdr = OxmlElement('w:pBdr') - pPr.insert_element_before(pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + pPr.insert_element_before( + pBdr, 'w:shd', 
'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', + 'w:wordWrap', 'w:overflowPunct', 'w:topLinePunct', + 'w:autoSpaceDE', 'w:autoSpaceDN', 'w:bidi', 'w:adjustRightInd', + 'w:snapToGrid', 'w:spacing', 'w:ind', 'w:contextualSpacing', + 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', - 'w:pPrChange' - ) + 'w:pPrChange') bottom = OxmlElement('w:bottom') bottom.set(qn('w:val'), 'single') bottom.set(qn('w:sz'), '6') @@ -527,7 +537,7 @@ def handle_data(self, data): # You cannot have interactive content in an A tag, this includes links # https://html.spec.whatwg.org/#interactive-content link = self.tags.get('a') - if link: + if link and 'href' in link.keys(): self.handle_link(link['href'], data) else: # If there's a link, dont put the data directly in the run @@ -568,7 +578,8 @@ def ignore_nested_tables(self, tables_soup): def get_table_rows(self, table_soup): # If there's a header, body, footer or direct child tr tags, add row dimensions from there - return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + return table_soup.select(', '.join(self.table_row_selectors), + recursive=False) def get_table_columns(self, row): # Get all columns for the specified row tag. @@ -579,16 +590,19 @@ def get_table_dimensions(self, table_soup): rows = self.get_table_rows(table_soup) # Table is either empty or has non-direct children between table and tr tags # Thus the row dimensions and column dimensions are assumed to be 0 - - cols = self.get_table_columns(rows[0]) if rows else [] - return len(rows), len(cols) + max_col_size = 0 + for r in range(0, len(rows)): + col_size = len(self.get_table_columns(rows[r])) + if max_col_size < col_size: + max_col_size = col_size + return len(rows), max_col_size def get_tables(self): if not hasattr(self, 'soup'): self.include_tables = False return # find other way to do it, or require this dependency? 
- self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.tables = self.ignore_nested_tables(self.soup.find_all('table')) self.table_no = 0 def run_process(self, html): @@ -602,14 +616,18 @@ def run_process(self, html): def add_html_to_document(self, html, document): if not isinstance(html, str): raise ValueError('First argument needs to be a %s' % str) - elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.document.Document) + elif not isinstance(document, + docx.document.Document) and not isinstance( + document, docx.table._Cell): + raise ValueError('Second argument needs to be a %s' % + docx.document.Document) self.set_initial_attrs(document) self.run_process(html) def add_html_to_cell(self, html, cell): if not isinstance(cell, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.table._Cell) + raise ValueError('Second argument needs to be a %s' % + docx.table._Cell) unwanted_paragraph = cell.paragraphs[0] if unwanted_paragraph.text == "": delete_paragraph(unwanted_paragraph) @@ -618,7 +636,7 @@ def add_html_to_cell(self, html, cell): # cells must end with a paragraph or will get message about corrupt file # https://stackoverflow.com/a/29287121 if not self.doc.paragraphs: - self.doc.add_paragraph('') + self.doc.add_paragraph('') def parse_html_file(self, filename_html, filename_docx=None): with open(filename_html, 'r') as infile: @@ -629,23 +647,28 @@ def parse_html_file(self, filename_html, filename_docx=None): path, filename = os.path.split(filename_html) filename_docx = '%s/new_docx_file_%s' % (path, filename) self.doc.save('%s.docx' % filename_docx) - + def parse_html_string(self, html): self.set_initial_attrs() self.run_process(html) return self.doc -if __name__=='__main__': - - arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting') - 
arg_parser.add_argument('filename_html', help='The .html file to be parsed') + +if __name__ == '__main__': + + arg_parser = argparse.ArgumentParser( + description='Convert .html file into .docx file with formatting') + arg_parser.add_argument('filename_html', + help='The .html file to be parsed') + arg_parser.add_argument( + 'filename_docx', + nargs='?', + help= + 'The name of the .docx file to be saved. Default new_docx_file_[filename_html]', + default=None) arg_parser.add_argument( - 'filename_docx', - nargs='?', - help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', - default=None - ) - arg_parser.add_argument('--bs', action='store_true', + '--bs', + action='store_true', help='Attempt to fix html before parsing. Requires bs4. Default True') args = vars(arg_parser.parse_args())