diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..8348ebb 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -30,7 +30,7 @@ # values in inches INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. DEFAULT_TABLE_STYLE = None @@ -42,6 +42,7 @@ def get_filename_from_url(url): return os.path.basename(urlparse(url).path) + def is_url(url): """ Not to be used for actually validating a url, but in our use case we only @@ -50,6 +51,7 @@ def is_url(url): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def fetch_image(url): """ Attempts to fetch an image from a url. @@ -64,9 +66,11 @@ def fetch_image(url): except urllib.error.URLError: return None + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -132,12 +136,14 @@ def remove_whitespace(string, leading=False, trailing=False): # TODO need some way to get rid of extra spaces in e.g. text text return re.sub(r'\s+', ' ', string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 p = paragraph._element p.getparent().remove(p) p._p = p._element = None + font_styles = { 'b': 'bold', 'strong': 'bold', @@ -160,8 +166,8 @@ def delete_paragraph(paragraph): 'LIST_NUMBER': 'List Number', } -class HtmlToDocx(HTMLParser): +class HtmlToDocx(HTMLParser): def __init__(self): super().__init__() self.options = { @@ -171,9 +177,7 @@ def __init__(self): 'styles': True, } self.table_row_selectors = [ - 'table > tr', - 'table > thead > tr', - 'table > tbody > tr', + 'table > tr', 'table > thead > tr', 'table > tbody > tr', 'table > tfoot > tr' ] self.table_style = DEFAULT_TABLE_STYLE @@ -188,9 +192,10 @@ def set_initial_attrs(self, document=None): self.doc = document else: self.doc = Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options[ + 'fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc - self.include_tables = True #TODO add this option back in? + self.include_tables = True #TODO add this option back in? self.include_images = self.options['images'] self.include_styles = self.options['styles'] self.paragraph = None @@ -223,7 +228,8 @@ def add_styles_to_paragraph(self, style): units = re.sub(r'[0-9]+', '', margin) margin = int(float(re.sub(r'[a-z]+', '', margin))) if units == 'px': - self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + self.paragraph.paragraph_format.left_indent = Inches( + min(margin // 10 * INDENT, MAX_INDENT)) # TODO handle non px units def add_styles_to_run(self, style): @@ -232,26 +238,27 @@ def add_styles_to_run(self, style): color = re.sub(r'[a-z()]+', '', style['color']) colors = [int(x) for x in color.split(',')] elif '#' in style['color']: - color = style['color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + color = style['color'].replace('\n', '').lstrip('#') + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing self.run.font.color.rgb = RGBColor(*colors) - + if 'background-color' in style: if 'rgb' in style['background-color']: - color = color = re.sub(r'[a-z()]+', '', style['background-color']) + color = color = re.sub(r'[a-z()]+', '', + style['background-color']) colors = [int(x) for x in color.split(',')] elif '#' in style['background-color']: color = style['background-color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing - self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors def apply_paragraph_style(self, style=None): try: @@ -260,7 +267,8 @@ def apply_paragraph_style(self, style=None): elif self.paragraph_style: self.paragraph.style = self.paragraph_style except KeyError as e: - raise ValueError(f"Unable to apply style {self.paragraph_style}.") from e + raise ValueError( + f"Unable to apply style {self.paragraph_style}.") from e def parse_dict_string(self, string, separator=';'): new_string = string.replace(" ", '').split(separator) @@ -273,15 +281,16 @@ def handle_li(self): if list_depth: list_type = self.tags['list'][-1] else: - list_type = 'ul' # assign unordered if no tag + list_type = 'ul' # assign unordered if no tag if list_type == 'ol': list_style = styles['LIST_NUMBER'] else: list_style = styles['LIST_BULLET'] - self.paragraph = self.doc.add_paragraph(style=list_style) - self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) + self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph.paragraph_format.left_indent = Inches( + min(list_depth * LIST_INDENT, MAX_INDENT)) self.paragraph.paragraph_format.line_spacing = 1 def add_image_to_cell(self, cell, image): @@ -319,7 +328,8 @@ def handle_img(self, current_attrs): self.doc.add_paragraph("" % src) else: # avoid exposing filepaths in document - self.doc.add_paragraph("" % get_filename_from_url(src)) + self.doc.add_paragraph("" % + get_filename_from_url(src)) # add styles? def handle_table(self): @@ -331,14 +341,15 @@ def handle_table(self): Tell HTMLParser to ignore any tags until the corresponding closing table tag """ table_soup = self.tables[self.table_no] - rows, cols = self.get_table_dimensions(table_soup) - self.table = self.doc.add_table(rows, cols) + rows_dim, cols_dim = self.get_table_dimensions(table_soup) + self.table = self.doc.add_table(rows_dim, cols_dim) if self.table_style: try: self.table.style = self.table_style except KeyError as e: - raise ValueError(f"Unable to apply style {self.table_style}.") from e + raise ValueError( + f"Unable to apply style {self.table_style}.") from e rows = self.get_table_rows(table_soup) cell_row = 0 @@ -355,7 +366,7 @@ def handle_table(self): child_parser.add_html_to_cell(cell_html, docx_cell) cell_col += 1 cell_row += 1 - + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -375,7 +386,6 @@ def handle_link(self, href, text): hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) - # Create sub-run subrun = self.paragraph.add_run() rPr = docx.oxml.shared.OxmlElement('w:rPr') @@ -417,7 +427,7 @@ def handle_starttag(self, tag, attrs): return elif tag == 'ol' or tag == 'ul': self.tags['list'].append(tag) - return # don't apply styles for now + return # don't apply styles for now elif tag == 'br': self.run.add_break() return @@ -438,15 +448,15 @@ def handle_starttag(self, tag, attrs): self.paragraph = self.doc.add_paragraph() pPr = self.paragraph._p.get_or_add_pPr() pBdr = OxmlElement('w:pBdr') - pPr.insert_element_before(pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + pPr.insert_element_before( + pBdr, 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', + 'w:wordWrap', 'w:overflowPunct', 'w:topLinePunct', + 'w:autoSpaceDE', 'w:autoSpaceDN', 'w:bidi', 'w:adjustRightInd', + 'w:snapToGrid', 'w:spacing', 'w:ind', 'w:contextualSpacing', + 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', - 'w:pPrChange' - ) + 'w:pPrChange') bottom = OxmlElement('w:bottom') bottom.set(qn('w:val'), 'single') bottom.set(qn('w:sz'), '6') @@ -527,7 +537,7 @@ def handle_data(self, data): # You cannot have interactive content in an A tag, this includes links # https://html.spec.whatwg.org/#interactive-content link = self.tags.get('a') - if link: + if link and 'href' in link.keys(): self.handle_link(link['href'], data) else: # If there's a link, dont put the data directly in the run @@ -568,7 +578,8 @@ def ignore_nested_tables(self, tables_soup): def get_table_rows(self, table_soup): # If there's a header, body, footer or direct child tr tags, add row dimensions from there - return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + return table_soup.select(', '.join(self.table_row_selectors), + recursive=False) def get_table_columns(self, row): # Get all columns for the specified row tag. @@ -579,16 +590,19 @@ def get_table_dimensions(self, table_soup): rows = self.get_table_rows(table_soup) # Table is either empty or has non-direct children between table and tr tags # Thus the row dimensions and column dimensions are assumed to be 0 - - cols = self.get_table_columns(rows[0]) if rows else [] - return len(rows), len(cols) + max_col_size = 0 + for r in range(0, len(rows)): + col_size = len(self.get_table_columns(rows[r])) + if max_col_size < col_size: + max_col_size = col_size + return len(rows), max_col_size def get_tables(self): if not hasattr(self, 'soup'): self.include_tables = False return # find other way to do it, or require this dependency? - self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.tables = self.ignore_nested_tables(self.soup.find_all('table')) self.table_no = 0 def run_process(self, html): @@ -602,14 +616,18 @@ def run_process(self, html): def add_html_to_document(self, html, document): if not isinstance(html, str): raise ValueError('First argument needs to be a %s' % str) - elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.document.Document) + elif not isinstance(document, + docx.document.Document) and not isinstance( + document, docx.table._Cell): + raise ValueError('Second argument needs to be a %s' % + docx.document.Document) self.set_initial_attrs(document) self.run_process(html) def add_html_to_cell(self, html, cell): if not isinstance(cell, docx.table._Cell): - raise ValueError('Second argument needs to be a %s' % docx.table._Cell) + raise ValueError('Second argument needs to be a %s' % + docx.table._Cell) unwanted_paragraph = cell.paragraphs[0] if unwanted_paragraph.text == "": delete_paragraph(unwanted_paragraph) @@ -618,7 +636,7 @@ def add_html_to_cell(self, html, cell): # cells must end with a paragraph or will get message about corrupt file # https://stackoverflow.com/a/29287121 if not self.doc.paragraphs: - self.doc.add_paragraph('') + self.doc.add_paragraph('') def parse_html_file(self, filename_html, filename_docx=None): with open(filename_html, 'r') as infile: @@ -629,23 +647,28 @@ def parse_html_file(self, filename_html, filename_docx=None): path, filename = os.path.split(filename_html) filename_docx = '%s/new_docx_file_%s' % (path, filename) self.doc.save('%s.docx' % filename_docx) - + def parse_html_string(self, html): self.set_initial_attrs() self.run_process(html) return self.doc -if __name__=='__main__': - - arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting') - arg_parser.add_argument('filename_html', help='The .html file to be parsed') + +if __name__ == '__main__': + + arg_parser = argparse.ArgumentParser( + description='Convert .html file into .docx file with formatting') + arg_parser.add_argument('filename_html', + help='The .html file to be parsed') + arg_parser.add_argument( + 'filename_docx', + nargs='?', + help= + 'The name of the .docx file to be saved. Default new_docx_file_[filename_html]', + default=None) arg_parser.add_argument( - 'filename_docx', - nargs='?', - help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', - default=None - ) - arg_parser.add_argument('--bs', action='store_true', + '--bs', + action='store_true', help='Attempt to fix html before parsing. Requires bs4. Default True') args = vars(arg_parser.parse_args())