Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
*.pyc

venv
/.idea
41 changes: 35 additions & 6 deletions htmldocx/h2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def add_styles_to_run(self, style):
if 'color' in style:
if 'rgb' in style['color']:
color = re.sub(r'[a-z()]+', '', style['color'])
colors = [int(x) for x in color.split(',')]
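# Keep only the first three components: a value such as
# "color: rgba(38, 42, 51, 0.9);" carries a trailing alpha (0.9) that used to
# raise an error when converted with int().
colors = [int(x) for x in color.split(',')[:3]]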
elif '#' in style['color']:
color = style['color'].lstrip('#')
colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
Expand Down Expand Up @@ -309,7 +309,13 @@ def handle_img(self, current_attrs):
if image:
try:
if isinstance(self.doc, docx.document.Document):
self.doc.add_picture(image)
width = current_attrs.get('width')
height = current_attrs.get('height')
self.doc.add_picture(
image_path_or_stream=image,
width=Inches(int(width) / 72) if width else None, # 72 is the default dpi
height=Inches(int(height) / 72) if height else None
)
else:
self.add_image_to_cell(self.doc, image)
except FileNotFoundError:
Expand Down Expand Up @@ -346,16 +352,29 @@ def handle_table(self):
cols = self.get_table_columns(row)
cell_col = 0
for col in cols:
colspan = int(col.attrs.get('colspan', 1))
rowspan = int(col.attrs.get('rowspan', 1))

cell_html = self.get_cell_html(col)
if col.name == 'th':
cell_html = "<b>%s</b>" % cell_html

docx_cell = self.table.cell(cell_row, cell_col)
while docx_cell.text != '': # Skip the merged cell
cell_col += 1
docx_cell = self.table.cell(cell_row, cell_col)

cell_to_merge = self.table.cell(cell_row + rowspan - 1, cell_col + colspan - 1)
if docx_cell != cell_to_merge:
docx_cell.merge(cell_to_merge)

child_parser = HtmlToDocx()
child_parser.copy_settings_from(self)
child_parser.add_html_to_cell(cell_html, docx_cell)
cell_col += 1
child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # occupy the position

cell_col += colspan
cell_row += 1

# skip all tags until corresponding closing tag
self.instances_to_skip = len(table_soup.find_all('table'))
self.skip_tag = 'table'
Expand Down Expand Up @@ -581,7 +600,13 @@ def get_table_dimensions(self, table_soup):
# Thus the row dimensions and column dimensions are assumed to be 0

cols = self.get_table_columns(rows[0]) if rows else []
return len(rows), len(cols)
# Add colspan calculation column number
col_count = 0
for col in cols:
colspan = col.attrs.get('colspan', 1)
col_count += int(colspan)

return len(rows), col_count

def get_tables(self):
if not hasattr(self, 'soup'):
Expand All @@ -597,16 +622,20 @@ def run_process(self, html):
html = str(self.soup)
if self.include_tables:
self.get_tables()

self.feed(html)


# Convert an HTML string and add the result to `document`.
#
# html:     the HTML markup to convert; must be a str.
# document: the target to write into; the check below accepts either a
#           docx.document.Document or a docx.table._Cell.
#
# Raises ValueError when either argument has the wrong type.
def add_html_to_document(self, html, document):
if not isinstance(html, str):
raise ValueError('First argument needs to be a %s' % str)
# NOTE(review): a _Cell is accepted here as well, but the error message below
# only mentions Document.
elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
raise ValueError('Second argument needs to be a %s' % docx.document.Document)

# Bind the parser state to the target, then parse the HTML.
self.set_initial_attrs(document)
self.run_process(html)


def add_html_to_cell(self, html, cell):
if not isinstance(cell, docx.table._Cell):
raise ValueError('Second argument needs to be a %s' % docx.table._Cell)
Expand Down
22 changes: 22 additions & 0 deletions tests/tables3.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<table border="1">
<tr>
<td rowspan="2" colspan="2">aa</td>
<td>bb</td>
</tr>
<tr>
<td>cc</td>
</tr>
<tr>
<td>dd</td>
<td colspan="2">ee</td>
</tr>
<tr>
<td rowspan="2">ff</td>
<td>gg</td>
<td>hh</td>
</tr>
<tr>
<td>ii</td>
<td>jj</td>
</tr>
</table>
9 changes: 9 additions & 0 deletions tests/test_tables_cell_merging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import os
from .context import HtmlToDocx, test_dir

# Manual test (requires inspection of the result) for converting HTML tables
# whose cells are merged via rowspan/colspan; the input fixture is tables3.html.
# The statements below run at module import time.

# Path of the fixture inside the shared test directory.
filename = os.path.join(test_dir, 'tables3.html')
d = HtmlToDocx()

# Convert the fixture file; the output must be checked by hand.
d.parse_html_file(filename)