From 30550d1dffdfd245a1ee6432af381093a2d13a37 Mon Sep 17 00:00:00 2001 From: Evi1ran Date: Sun, 10 Apr 2022 18:59:14 +0800 Subject: [PATCH] [Add Features ] Support: page break, etc. [Add Features ] Support: page break, centered images, centered tables, etc. --- htmldocx/h2d.py | 290 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 61 deletions(-) diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..100b9c0 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -12,14 +12,22 @@ How to deal with block level style applied over table elements? e.g. text align """ +import base64 +import binascii +import http +import pathlib import re, argparse import io, os +import time import urllib.request +from typing import Optional, cast, Dict from urllib.parse import urlparse from html.parser import HTMLParser import docx, docx.table from docx import Document +from docx.image.exceptions import UnrecognizedImageError +from docx.image.image import Image from docx.shared import RGBColor, Pt, Inches from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement @@ -27,10 +35,18 @@ from bs4 import BeautifulSoup +USABLE_HEIGHT = Inches(8.1) +USABLE_WIDTH = Inches(5.8) +DEFAULT_DPI = 72 + +MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MiB + +RFC_2397_BASE64 = ";base64" + # values in inches INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. DEFAULT_TABLE_STYLE = None @@ -42,6 +58,7 @@ def get_filename_from_url(url): return os.path.basename(urlparse(url).path) + def is_url(url): """ Not to be used for actually validating a url, but in our use case we only @@ -50,23 +67,77 @@ def is_url(url): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) -def fetch_image(url): - """ - Attempts to fetch an image from a url. - If successful returns a bytes object, else returns None - :return: - """ - try: - with urllib.request.urlopen(url) as response: - # security flaw? - return io.BytesIO(response.read()) - except urllib.error.URLError: - return None +def make_image(data: Optional[bytes]) -> io.BytesIO: + image_buffer = None + if data: + image_buffer = io.BytesIO(data) + try: + Image.from_blob(image_buffer.getbuffer()) + except UnrecognizedImageError: + image_buffer = None + + if not image_buffer: + broken_img_path = pathlib.Path(__file__).parent / "image-broken.png" + image_buffer = io.BytesIO(broken_img_path.read_bytes()) + + return image_buffer + + +def load_external_image(src: str) -> Optional[bytes]: + data = None + retry = 3 + while retry and not data: + try: + with urllib.request.urlopen(src) as response: + size = response.getheader("Content-Length") + if size and int(size) > MAX_IMAGE_SIZE: + break + # Read up to MAX_IMAGE_SIZE when response does not contain + # the Content-Length header. The extra byte avoids an extra read to + # check whether the EOF was reached. + data = cast(bytes, response.read(MAX_IMAGE_SIZE + 1)) + except (ValueError, http.client.HTTPException, urllib.error.HTTPError): + # ValueError: Invalid URL or non-integer Content-Length. + # HTTPException: Server does not speak HTTP properly. + # HTTPError: Server could not perform request. + retry = 0 + except urllib.error.URLError: + # URLError: Transient network error, e.g. DNS request failed. + retry -= 1 + if retry: + time.sleep(1) + else: + if len(data) <= MAX_IMAGE_SIZE: + return data + return None + + +def load_inline_image(src: str) -> Optional[bytes]: + image_data = None + header_data = src.split(RFC_2397_BASE64 + ",", maxsplit=1) + if len(header_data) == 2: + data = header_data[1] + try: + image_data = base64.b64decode(data, validate=True) + except (binascii.Error, ValueError): + # binascii.Error: Character outside of base64 set. + # ValueError: Character outside of ASCII. + pass + return image_data + + +def load_image(src: str) -> io.BytesIO: + image_bytes = ( + load_inline_image(src) if src.startswith("data:") else load_external_image(src) + ) + return make_image(image_bytes) + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -132,12 +203,14 @@ def remove_whitespace(string, leading=False, trailing=False): # TODO need some way to get rid of extra spaces in e.g. text text return re.sub(r'\s+', ' ', string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 p = paragraph._element p.getparent().remove(p) p._p = p._element = None + font_styles = { 'b': 'bold', 'strong': 'bold', @@ -160,6 +233,7 @@ def delete_paragraph(paragraph): 'LIST_NUMBER': 'List Number', } + class HtmlToDocx(HTMLParser): def __init__(self): @@ -188,9 +262,9 @@ def set_initial_attrs(self, document=None): self.doc = document else: self.doc = Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc - self.include_tables = True #TODO add this option back in? + self.include_tables = True # TODO add this option back in? self.include_images = self.options['images'] self.include_styles = self.options['styles'] self.paragraph = None @@ -218,13 +292,44 @@ def add_styles_to_paragraph(self, style): self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT elif align == 'justify': self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY - if 'margin-left' in style: + if 'margin-left' in style and 'margin-right' in style: + margin_left = style['margin-left'] + margin_right = style['margin-right'] + if "auto" in margin_left and "auto" in margin_right: + self.paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif 'margin-left' in style: + margin = style['margin-left'] + units = re.sub(r'[0-9]+', '', margin) + margin_suffix = re.sub(r'[a-z]+', '', margin) + if len(margin_suffix) > 0: + margin = int(float(margin_suffix)) + if units == 'px': + self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + # TODO handle non px units + + def add_styles_to_table(self, style): + if 'text-align' in style: + align = style['text-align'] + if align == 'center': + self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif align == 'right': + self.table.alignment = WD_ALIGN_PARAGRAPH.RIGHT + elif align == 'justify': + self.table.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + if 'margin-left' in style and 'margin-right' in style: + margin_left = style['margin-left'] + margin_right = style['margin-right'] + if "auto" in margin_left and "auto" in margin_right: + self.table.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif 'margin-left' in style: margin = style['margin-left'] units = re.sub(r'[0-9]+', '', margin) - margin = int(float(re.sub(r'[a-z]+', '', margin))) - if units == 'px': - self.paragraph.paragraph_format.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) - # TODO handle non px units + margin_suffix = re.sub(r'[a-z]+', '', margin) + if len(margin_suffix) > 0: + margin = int(float(margin_suffix)) + if units == 'px': + self.table.left_indent = Inches(min(margin // 10 * INDENT, MAX_INDENT)) + # TODO handle non px units def add_styles_to_run(self, style): if 'color' in style: @@ -233,25 +338,25 @@ def add_styles_to_run(self, style): colors = [int(x) for x in color.split(',')] elif '#' in style['color']: color = style['color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing self.run.font.color.rgb = RGBColor(*colors) - + if 'background-color' in style: if 'rgb' in style['background-color']: color = color = re.sub(r'[a-z()]+', '', style['background-color']) colors = [int(x) for x in color.split(',')] elif '#' in style['background-color']: color = style['background-color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing - self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + self.run.font.highlight_color = WD_COLOR.GRAY_25 # TODO: map colors def apply_paragraph_style(self, style=None): try: @@ -273,14 +378,14 @@ def handle_li(self): if list_depth: list_type = self.tags['list'][-1] else: - list_type = 'ul' # assign unordered if no tag + list_type = 'ul' # assign unordered if no tag if list_type == 'ol': list_style = styles['LIST_NUMBER'] else: list_style = styles['LIST_BULLET'] - self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph = self.doc.add_paragraph(style=list_style) self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) self.paragraph.paragraph_format.line_spacing = 1 @@ -295,21 +400,20 @@ def handle_img(self, current_attrs): self.skip = True self.skip_tag = 'img' return - src = current_attrs['src'] - # fetch image + src = current_attrs.get("src") src_is_url = is_url(src) - if src_is_url: - try: - image = fetch_image(src) - except urllib.error.URLError: - image = None - else: - image = src + height_attr = current_attrs.get("height") + width_attr = current_attrs.get("width") + height_px = int(height_attr) if height_attr else None + width_px = int(width_attr) if width_attr else None + + image = load_image(src) + size = image_size(image, width_px, height_px) # add image to doc if image: try: if isinstance(self.doc, docx.document.Document): - self.doc.add_picture(image) + self.doc.add_picture(image, **size) else: self.add_image_to_cell(self.doc, image) except FileNotFoundError: @@ -320,9 +424,8 @@ def handle_img(self, current_attrs): else: # avoid exposing filepaths in document self.doc.add_paragraph("" % get_filename_from_url(src)) - # add styles? - def handle_table(self): + def handle_table(self, current_attrs): """ To handle nested tables, we will parse tables manually as follows: Get table soup @@ -355,13 +458,22 @@ def handle_table(self): child_parser.add_html_to_cell(cell_html, docx_cell) cell_col += 1 cell_row += 1 - + + if 'style' in current_attrs and self.table: + style = self.parse_dict_string(current_attrs['style']) + self.add_styles_to_table(style) + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' self.skip = True self.table = None + def handle_div(self, current_attrs): + # handle page break + if 'style' in current_attrs and "page-break-after: always" in current_attrs['style']: + self.doc.add_page_break() + def handle_link(self, href, text): # Link requires a relationship is_external = href.startswith('http') @@ -375,7 +487,6 @@ def handle_link(self, href, text): hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) - # Create sub-run subrun = self.paragraph.add_run() rPr = docx.oxml.shared.OxmlElement('w:rPr') @@ -417,7 +528,7 @@ def handle_starttag(self, tag, attrs): return elif tag == 'ol' or tag == 'ul': self.tags['list'].append(tag) - return # don't apply styles for now + return # don't apply styles for now elif tag == 'br': self.run.add_break() return @@ -439,14 +550,14 @@ def handle_starttag(self, tag, attrs): pPr = self.paragraph._p.get_or_add_pPr() pBdr = OxmlElement('w:pBdr') pPr.insert_element_before(pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', - 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', - 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', - 'w:pPrChange' - ) + 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', + 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', + 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', + 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', + 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', + 'w:pPrChange' + ) bottom = OxmlElement('w:bottom') bottom.set(qn('w:val'), 'single') bottom.set(qn('w:sz'), '6') @@ -463,12 +574,15 @@ def handle_starttag(self, tag, attrs): elif tag == 'img': self.handle_img(current_attrs) - return + self.paragraph = self.doc.paragraphs[-1] elif tag == 'table': - self.handle_table() + self.handle_table(current_attrs) return + elif tag == "div": + self.handle_div(current_attrs) + # set new run reference point in case of leading line breaks if tag in ['p', 'li', 'pre']: self.run = self.paragraph.add_run() @@ -588,7 +702,7 @@ def get_tables(self): self.include_tables = False return # find other way to do it, or require this dependency? - self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.tables = self.ignore_nested_tables(self.soup.find_all('table')) self.table_no = 0 def run_process(self, html): @@ -618,7 +732,7 @@ def add_html_to_cell(self, html, cell): # cells must end with a paragraph or will get message about corrupt file # https://stackoverflow.com/a/29287121 if not self.doc.paragraphs: - self.doc.add_paragraph('') + self.doc.add_paragraph('') def parse_html_file(self, filename_html, filename_docx=None): with open(filename_html, 'r') as infile: @@ -629,26 +743,80 @@ def parse_html_file(self, filename_html, filename_docx=None): path, filename = os.path.split(filename_html) filename_docx = '%s/new_docx_file_%s' % (path, filename) self.doc.save('%s.docx' % filename_docx) - + def parse_html_string(self, html): self.set_initial_attrs() self.run_process(html) return self.doc -if __name__=='__main__': - + +if __name__ == '__main__': arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting') arg_parser.add_argument('filename_html', help='The .html file to be parsed') arg_parser.add_argument( - 'filename_docx', - nargs='?', - help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', + 'filename_docx', + nargs='?', + help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', default=None ) - arg_parser.add_argument('--bs', action='store_true', - help='Attempt to fix html before parsing. Requires bs4. Default True') + arg_parser.add_argument('--bs', action='store_true', + help='Attempt to fix html before parsing. Requires bs4. Default True') args = vars(arg_parser.parse_args()) file_html = args.pop('filename_html') html_parser = HtmlToDocx() html_parser.parse_html_file(file_html, **args) + + +def image_size( + image_buffer: io.BytesIO, + width_px: Optional[int] = None, + height_px: Optional[int] = None, +) -> Dict[str, int]: + """ + Compute width and height to feed python-docx so that image is contained in the page + and respects width_px and height_px. + Return: + Empty: No resize + Single dimension (width or height): image ratio is expected to be maintained + Two dimensions (width and height): image should be resized to dimensions + """ + image = Image.from_blob(image_buffer.getbuffer()) + + # Normalize image size to inches. + # - Without a specified pixel size, images are their actual pixel size, so that + # images of the same pixel size appear the same size in the document, regardless + # of their resolution. + # - With a specified pixel size, images should take the specified size, regardless + # of their resolution. + if height_px is None: + height = image.px_height / image.vert_dpi + else: + height = height_px / DEFAULT_DPI + if width_px is None: + width = image.px_width / image.horz_dpi + else: + width = width_px / DEFAULT_DPI + + height = Inches(height) + width = Inches(width) + + size = {} + if width > USABLE_WIDTH: + new_height = round(image.px_height / (image.px_width / USABLE_WIDTH)) + if new_height > USABLE_HEIGHT: + size["height"] = USABLE_HEIGHT + else: + size["width"] = USABLE_WIDTH + elif height > USABLE_HEIGHT: + new_width = round(image.px_width / (image.px_height / USABLE_HEIGHT)) + if new_width > USABLE_WIDTH: + size["width"] = USABLE_WIDTH + else: + size["height"] = USABLE_HEIGHT + else: + if width_px is not None: + size["width"] = width + if height_px is not None: + size["height"] = height + return size