Skip to content

Commit f823793

Browse files
Jon Grace-Cox (jongracecox)
authored and committed
Add handling for pre and code blocks
This commit adds handling for pre and code blocks.

- Pre and code blocks use Courier font.
- Pre blocks retain white space and new line characters.
- Code blocks do not retain white space and new lines.

Changes
=======

- Add unit tests for code and pre blocks, and inline code snippets.
- Add manual test for code blocks. Run this manually to validate the visual representation of code and pre-formatted blocks.
- Add doctests for the remove_whitespace function to help show exactly what the function does.
- Run h2d doctests in the python-app workflow.
- Run the test_code module in the python-app workflow.
- Update remove_whitespace to allow removing leading and trailing new lines wrapped with white space. This prevents spaces appearing before or after code blocks.
1 parent ad1dd21 commit f823793

File tree

5 files changed

+169
-11
lines changed

5 files changed

+169
-11
lines changed

.github/workflows/python-app.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ jobs:
3535
run: |
3636
python -m tests.test
3737
python -m tests.test_tables
38+
python -m tests.test_code
39+
python -m doctest -v htmldocx/h2d.py
3840
- name: Upload test artifacts
3941
uses: actions/[email protected]
4042
with:

htmldocx/h2d.py

Lines changed: 89 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,67 @@ def fetch_image(url):
5858
def remove_last_occurence(ls, x):
5959
ls.pop(len(ls) - ls[::-1].index(x) - 1)
6060

61-
def remove_whitespace(string):
61+
def remove_whitespace(string, leading=False, trailing=False):
    """Remove new lines from a string and squash the surrounding white space.

    Args:
        string(str): The string to remove white space from.
        leading(bool, optional): Remove leading new lines, along with any
            white space around them, when True. Defaults to False.
        trailing(bool, optional): Remove trailing new lines, along with any
            white space around them, when True. Defaults to False.

    Returns:
        str: The input string with every run of new line characters (and the
        white space around it) replaced by a single space, and any run of two
        or more white space characters between a ``>`` and a ``<`` removed.

    Examples:

        Single or multiple new line characters are replaced with space.

        >>> remove_whitespace("abc\\ndef")
        'abc def'
        >>> remove_whitespace("abc\\n\\n\\ndef")
        'abc def'

        New line characters surrounded by white space are replaced with a single space.

        >>> remove_whitespace("abc \\n \\n \\n def")
        'abc def'
        >>> remove_whitespace("abc  \\n  \\n  \\n  def")
        'abc def'

        Leading and trailing new lines are replaced with a single space.

        >>> remove_whitespace("\\nabc")
        ' abc'
        >>> remove_whitespace(" \\n abc")
        ' abc'
        >>> remove_whitespace("abc\\n")
        'abc '
        >>> remove_whitespace("abc \\n ")
        'abc '

        Use ``leading=True`` to remove leading new line characters, including any surrounding
        white space:

        >>> remove_whitespace("\\nabc", leading=True)
        'abc'
        >>> remove_whitespace(" \\n abc", leading=True)
        'abc'

        Use ``trailing=True`` to remove trailing new line characters, including any surrounding
        white space:

        >>> remove_whitespace("abc \\n ", trailing=True)
        'abc'

        Two or more white space characters between a closing and an opening angle bracket
        are removed; a single space is kept:

        >>> remove_whitespace("<b>a</b>   <i>b</i>")
        '<b>a</b><i>b</i>'
        >>> remove_whitespace("<b>a</b> <i>b</i>")
        '<b>a</b> <i>b</i>'
    """
    # Remove any leading new line characters along with any surrounding white space.
    # This must run before the generic substitution below, which would otherwise
    # turn the leading new line into a single leading space.
    if leading:
        string = re.sub(r'^\s*\n+\s*', '', string)

    # Remove any trailing new line characters along with any surrounding white space.
    if trailing:
        string = re.sub(r'\s*\n+\s*$', '', string)

    # Replace new line characters and absorb any surrounding space.
    string = re.sub(r'\s*\n\s*', ' ', string)

    # Drop runs of two or more white space characters between '>' and '<'.
    # The quantifier must be written '{2,}'; '{2+}' is not a quantifier and is
    # matched as the literal text '{2+}', so it never collapsed anything.
    return re.sub(r'>\s{2,}<', '><', string)
64124

@@ -68,7 +128,7 @@ def delete_paragraph(paragraph):
68128
p.getparent().remove(p)
69129
p._p = p._element = None
70130

71-
fonts = {
131+
font_styles = {
72132
'b': 'bold',
73133
'strong': 'bold',
74134
'em': 'italic',
@@ -80,6 +140,11 @@ def delete_paragraph(paragraph):
80140
'th': 'bold',
81141
}
82142

143+
font_names = {
144+
'code': 'Courier',
145+
'pre': 'Courier',
146+
}
147+
83148
class HtmlToDocx(HTMLParser):
84149

85150
def __init__(self):
@@ -309,7 +374,7 @@ def handle_starttag(self, tag, attrs):
309374
return
310375

311376
self.tags[tag] = current_attrs
312-
if tag == 'p':
377+
if tag in ['p', 'pre']:
313378
self.paragraph = self.doc.add_paragraph()
314379

315380
elif tag == 'li':
@@ -331,7 +396,7 @@ def handle_starttag(self, tag, attrs):
331396
return
332397

333398
# set new run reference point in case of leading line breaks
334-
if tag == 'p' or tag == 'li':
399+
if tag in ['p', 'li', 'pre']:
335400
self.run = self.paragraph.add_run()
336401

337402
# add style
@@ -375,6 +440,18 @@ def handle_data(self, data):
375440
if self.skip:
376441
return
377442

443+
# Only remove white space if we're not in a pre block.
444+
if 'pre' not in self.tags:
445+
446+
args = {}
447+
448+
# In a code block we want to strip leading and trailing new lines and white space.
449+
# Without this we would have a leading space in the code block.
450+
if 'code' in self.tags:
451+
args['leading'] = True
452+
args['trailing'] = True
453+
data = remove_whitespace(data, **args)
454+
378455
if not self.paragraph:
379456
self.paragraph = self.doc.add_paragraph()
380457

@@ -393,13 +470,16 @@ def handle_data(self, data):
393470
style = self.parse_dict_string(span['style'])
394471
self.add_styles_to_run(style)
395472

396-
397-
# add font style
473+
# add font style and name
398474
for tag in self.tags:
399-
if tag in fonts:
400-
font_style = fonts[tag]
475+
if tag in font_styles:
476+
font_style = font_styles[tag]
401477
setattr(self.run.font, font_style, True)
402478

479+
if tag in font_names:
480+
font_name = font_names[tag]
481+
self.run.font.name = font_name
482+
403483
def ignore_nested_tables(self, tables_soup):
404484
"""
405485
Returns array containing only the highest level tables
@@ -446,9 +526,7 @@ def get_tables(self):
446526
def run_process(self, html):
447527
if self.bs and BeautifulSoup:
448528
self.soup = BeautifulSoup(html, 'html.parser')
449-
html = remove_whitespace(str(self.soup))
450-
else:
451-
html = remove_whitespace(html)
529+
html = str(self.soup)
452530
if self.include_tables:
453531
self.get_tables()
454532
self.feed(html)

tests/code.html

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
<h1>Code Test</h1>
2+
3+
<p>This test validates handling of code blocks.</p>
4+
5+
<h2>Pre-formatted block</h2>
6+
7+
<pre>
8+
This is a pre-formatted block.
9+
That should be pre-formatted.
10+
Retaining any carriage returns, and all white space.
11+
12+
And blank lines.
13+
</pre>
14+
15+
<h2>Code block</h2>
16+
17+
<p><code>
18+
This is a code block.
19+
That should be NOT be pre-formatted.
20+
It should NOT retain carriage returns, or all white space.
21+
22+
or blank lines.
23+
</code></p>
24+
25+
<h2>Code elements</h2>
26+
27+
<p>This is a sentence that includes <code>code</code> elements.</p>

tests/test.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,5 +95,47 @@ def test_add_html_to_cells_method(self):
9595
html = '''<p>Line 0 with p tags</p>Line 1 without p tags'''
9696
self.parser.add_html_to_cell(html, cell)
9797

98+
def test_inline_code(self):
99+
self.document.add_heading(
100+
'Test: inline code block',
101+
level=1
102+
)
103+
104+
html = "<p>This is a sentence that contains <code>some code elements</code> that " \
105+
"should appear as code.</p>"
106+
self.parser.add_html_to_document(html, self.document)
107+
108+
def test_code_block(self):
109+
self.document.add_heading(
110+
'Test: code block',
111+
level=1
112+
)
113+
114+
html = """<p><code>
115+
This is a code block.
116+
That should be NOT be pre-formatted.
117+
It should NOT retain carriage returns,
118+
119+
or blank lines.
120+
</code></p>"""
121+
self.parser.add_html_to_document(html, self.document)
122+
123+
def test_pre_block(self):
124+
self.document.add_heading(
125+
'Test: pre block',
126+
level=1
127+
)
128+
129+
html = """<pre>
130+
This is a pre-formatted block.
131+
That should be pre-formatted.
132+
Retaining any carriage returns,
133+
134+
and blank lines.
135+
</pre>
136+
"""
137+
self.parser.add_html_to_document(html, self.document)
138+
139+
98140
if __name__ == '__main__':
99141
unittest.main()

tests/test_code.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import os
2+
from .context import HtmlToDocx, test_dir
3+
4+
# Manual test (requires inspection of result) for converting code and pre blocks.
5+
6+
filename = os.path.join(test_dir, 'code.html')
7+
d = HtmlToDocx()
8+
9+
d.parse_html_file(filename)

0 commit comments

Comments
 (0)