Skip to content

Commit c699ed6

Browse files
author
Ava Thorn
committed
Merge branch 'issue_6' into parse_html_string
2 parents 79dc4bd + 40bab4b commit c699ed6

File tree

2 files changed

+144
-8
lines changed

2 files changed

+144
-8
lines changed

htmldocx/h2d.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ def __init__(self):
9090
'tables': True,
9191
'styles': True,
9292
}
93+
self.table_row_selectors = [
94+
'table > tr',
95+
'table > thead > tr',
96+
'table > tbody > tr',
97+
'table > tfoot > tr'
98+
]
9399

94100
def set_initial_attrs(self, document=None):
95101
self.tags = {
@@ -112,9 +118,9 @@ def set_initial_attrs(self, document=None):
112118

113119
def get_cell_html(self, soup):
114120
# Returns string of td element with opening and closing <td> tags removed
115-
if soup.find_all():
116-
return '\n'.join(str(soup).split('\n')[1:-1])
117-
return str(soup)[4:-5]
121+
# Cannot use find_all as it only finds element tags and does not find text which
122+
# is not inside an element
123+
return ' '.join([str(i) for i in soup.contents])
118124

119125
def add_styles_to_paragraph(self, style):
120126
if 'text-align' in style:
@@ -223,10 +229,10 @@ def handle_table(self):
223229
table_soup = self.tables[self.table_no]
224230
rows, cols = self.get_table_dimensions(table_soup)
225231
self.table = self.doc.add_table(rows, cols)
226-
rows = table_soup.find_all('tr', recursive=False)
232+
rows = self.get_table_rows(table_soup)
227233
cell_row = 0
228234
for row in rows:
229-
cols = row.find_all(['th', 'td'], recursive=False)
235+
cols = self.get_table_columns(table_soup, row)
230236
cell_col = 0
231237
for col in cols:
232238
cell_html = self.get_cell_html(col)
@@ -372,10 +378,21 @@ def ignore_nested_tables(self, tables_soup):
372378
new_tables.append(table)
373379
nest = len(table.find_all('table'))
374380
return new_tables
375-
381+
382+
def get_table_rows(self, table_soup):
383+
# Return the tr tags matched by self.table_row_selectors: rows directly under a table, or under its thead, tbody or tfoot
384+
return table_soup.select(', '.join(self.table_row_selectors), recursive=False)
385+
386+
def get_table_columns(self, table_soup, row):
387+
# Get all columns for the specified row tag.
388+
return row.find_all(['th', 'td'], recursive=False) if row else []
389+
376390
def get_table_dimensions(self, table_soup):
377-
rows = table_soup.find_all('tr', recursive=False)
378-
cols = rows[0].find_all(['th', 'td'], recursive=False)
391+
# Get rows for the table
392+
rows = self.get_table_rows(table_soup)
393+
# Table is either empty or has non-direct children between table and tr tags
394+
# Thus the row dimensions and column dimensions are assumed to be 0
395+
cols = self.get_table_columns(table_soup, rows[0]) if rows else []
379396
return len(rows), len(cols)
380397

381398
def get_tables(self):

tests/tables1.html

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,125 @@ <h1>Hi</h1>
1717
<td>cell 6</td>
1818
</tr>
1919
</table>
20+
<p>Here is one that uses tbody and thead</p>
21+
<table>
22+
<thead>
23+
<tr>
24+
<th>heading 1</th>
25+
<th>th 2</th>
26+
<th>th 3</th>
27+
</tr>
28+
</thead>
29+
<tbody>
30+
<tr>
31+
<td>cell 1</td>
32+
<td>cell 2</td>
33+
<td>cell 3</td>
34+
</tr>
35+
<tr>
36+
<td><em>a special</em>cell 4</td>
37+
<td>cell 5</td>
38+
<td>cell 6</td>
39+
</tr>
40+
</tbody>
41+
</table>
42+
<p>Here is one that uses only tbody and not thead</p>
43+
<table>
44+
<tr>
45+
<th>heading 1</th>
46+
<th>th 2</th>
47+
<th>th 3</th>
48+
</tr>
49+
<tbody>
50+
<tr>
51+
<td>cell 1</td>
52+
<td>cell 2</td>
53+
<td>cell 3</td>
54+
</tr>
55+
<tr>
56+
<td><em>a special</em>cell 4</td>
57+
<td>cell 5</td>
58+
<td>cell 6</td>
59+
</tr>
60+
</tbody>
61+
</table>
62+
<p>Here is one that uses only thead and not tbody</p>
63+
<table>
64+
<thead>
65+
<tr>
66+
<th>heading 1</th>
67+
<th>th 2</th>
68+
<th>th 3</th>
69+
</tr>
70+
</thead>
71+
<tr>
72+
<td>cell 1</td>
73+
<td>cell 2</td>
74+
<td>cell 3</td>
75+
</tr>
76+
<tr>
77+
<td><em>a special</em>cell 4</td>
78+
<td>cell 5</td>
79+
<td>cell 6</td>
80+
</tr>
81+
</table>
82+
<p>Here is one that uses only thead and tfoot</p>
83+
<table>
84+
<thead>
85+
<tr>
86+
<th>heading 1</th>
87+
<th>th 2</th>
88+
<th>th 3</th>
89+
</tr>
90+
</thead>
91+
<tfoot>
92+
<tr>
93+
<td>cell 1</td>
94+
<td>cell 2</td>
95+
<td>cell 3</td>
96+
</tr>
97+
<tr>
98+
<td><em>a special</em>cell 4</td>
99+
<td>cell 5</td>
100+
<td>cell 6</td>
101+
</tr>
102+
</tfoot>
103+
</table>
104+
<p>Here is one that uses only tfoot and inner trs</p>
105+
<table>
106+
<tr>
107+
<th>heading 1</th>
108+
<th>th 2</th>
109+
<th>th 3</th>
110+
</tr>
111+
<tfoot>
112+
<tr>
113+
<td>cell 1</td>
114+
<td>cell 2</td>
115+
<td>cell 3</td>
116+
</tr>
117+
<tr>
118+
<td><em>a special</em>cell 4</td>
119+
<td>cell 5</td>
120+
<td>cell 6</td>
121+
</tr>
122+
</tfoot>
123+
</table>
124+
<p>Here is one that uses only tfoot</p>
125+
<table>
126+
<tfoot>
127+
<tr>
128+
<td>cell 1</td>
129+
<td>cell 2</td>
130+
<td>cell 3</td>
131+
</tr>
132+
<tr>
133+
<td><em>a special</em>cell 4</td>
134+
<td>cell 5</td>
135+
<td>cell 6</td>
136+
</tr>
137+
</tfoot>
138+
</table>
20139

21140
Here is another table. This one has formatting.
22141
<s>

0 commit comments

Comments
 (0)