Refactor AutolinkExtensionSyntax (#471)
* Refactor AutolinkExtensionSyntax
Fix https://github.com/dart-lang/markdown/issues/470
* Improve readability
* optimise a bit
* Fix some requests from review
* Update gfm_stats.txt
* An optimisation.
diff --git a/lib/src/inline_syntaxes/autolink_extension_syntax.dart b/lib/src/inline_syntaxes/autolink_extension_syntax.dart
index ff2bc59..152b9a4 100644
--- a/lib/src/inline_syntaxes/autolink_extension_syntax.dart
+++ b/lib/src/inline_syntaxes/autolink_extension_syntax.dart
@@ -3,134 +3,128 @@
// BSD-style license that can be found in the LICENSE file.
import '../ast.dart';
+import '../charcode.dart';
import '../inline_parser.dart';
import '../util.dart';
import 'inline_syntax.dart';
-/// Matches autolinks like `http://foo.com`.
+/// Matches autolinks like `http://foo.com` and `foo@bar.com`.
class AutolinkExtensionSyntax extends InlineSyntax {
- /// Broken up parts of the autolink regex for reusability and readability
+ static const _linkPattern =
+ // Autolinks can only come at the beginning of a line, after whitespace,
+ // or any of the delimiting characters *, _, ~, and (.
+ r'(?<=^|[\s*_~(>])'
- // Autolinks can only come at the beginning of a line, after whitespace, or
- // any of the delimiting characters *, _, ~, and (.
- static const start = r'(?:^|[\s*_~(>])';
+ // An extended url autolink will be recognised when one of the schemes
+ // http://, https:// or ftp://, or the text www., is followed by a valid
+ // domain. See https://github.github.com/gfm/#extended-url-autolink.
+ r'(?:(?:https?|ftp):\/\/|www\.)'
- // An extended url autolink will be recognized when one of the schemes
- // http://, https://, or ftp://, followed by a valid domain
- static const scheme = r'(?:(?:https?|ftp):\/\/|www\.)';
+ // A valid domain consists of segments of alphanumeric characters,
+ // underscores (_) and hyphens (-) separated by periods (.). There must
+ // be at least one period, and no underscores may be present in the last
+ // two segments of the domain. See
+ // https://github.github.com/gfm/#valid-domain.
+ r'(?:[-_a-z0-9]+\.)*(?:[-a-z0-9]+\.[-a-z0-9]+)'
- // A valid domain consists of alphanumeric characters, underscores (_),
- // hyphens (-) and periods (.). There must be at least one period, and no
- // underscores may be present in the last two segments of the domain.
- static const domainPart = r'\w\-';
- static const domain = '[$domainPart][$domainPart.]+';
+ // After a valid domain, zero or more non-space non-< characters may
+ // follow.
+ r'[^\s<]*'
- // A valid domain consists of alphanumeric characters, underscores (_),
- // hyphens (-) and periods (.).
- static const path = r'[^\s<]*';
+ // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
+ // not be considered part of the autolink, though they may be included in
+ // the interior of the link. See
+ // https://github.github.com/gfm/#extended-autolink-path-validation.
+ '(?<![?!.,:*_~])';
- // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will not
- // be considered part of the autolink
- static const truncatingPunctuationPositive = '[?!.,:*_~]';
+ // An extended email autolink, see
+ // https://github.github.com/gfm/#extended-email-autolink.
+ static const _emailPattern =
+ r'[-_.+a-z0-9]+@(?:[-_a-z0-9]+\.)+[-_a-z0-9]*[a-z0-9](?![-_])';
- static final regExpTrailingPunc = RegExp('$truncatingPunctuationPositive*\$');
- static final regExpEndsWithColon = RegExp(r'\&[a-zA-Z0-9]+;$');
- static final regExpWhiteSpace = RegExp(r'\s');
-
- AutolinkExtensionSyntax() : super('$start(($scheme)($domain)($path))');
+ AutolinkExtensionSyntax()
+ : super(
+ '($_linkPattern)|($_emailPattern)',
+ caseSensitive: false,
+ );
@override
bool tryMatch(InlineParser parser, [int? startMatchPos]) {
- return super.tryMatch(parser, parser.pos > 0 ? parser.pos - 1 : 0);
+ startMatchPos ??= parser.pos;
+ final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
+ if (startMatch == null) {
+ return false;
+ }
+ parser.writeText();
+ return onMatch(parser, startMatch);
}
@override
bool onMatch(InlineParser parser, Match match) {
- var url = match[1]!;
- var href = url;
- var matchLength = url.length;
+ int consumeLength;
- if (url[0] == '>' || url.startsWith(regExpWhiteSpace)) {
- url = url.substring(1, url.length - 1);
- href = href.substring(1, href.length - 1);
- parser.pos++;
- matchLength--;
+ final isEmailLink = match[2] != null;
+ if (isEmailLink) {
+ consumeLength = match.match.length;
+ } else {
+ consumeLength = _getConsumeLength(match.match);
}
- // Prevent accidental standard autolink matches
- if (url.endsWith('>') && parser.source[parser.pos - 1] == '<') {
- return false;
+ var text = match.match.substring(0, consumeLength);
+ text = parser.encodeHtml ? escapeHtml(text) : text;
+
+ var destination = text;
+ if (isEmailLink) {
+ destination = 'mailto:$destination';
+ } else if (destination[0] == 'w') {
+ // When there is no scheme specified, insert the scheme `http`.
+ destination = 'http://$destination';
}
- // When an autolink ends in ), we scan the entire autolink for the total
- // number of parentheses. If there is a greater number of closing
- // parentheses than opening ones, we don’t consider the last character
- // part of the autolink, in order to facilitate including an autolink
- // inside a parenthesis:
- // https://github.github.com/gfm/#example-600
- if (url.endsWith(')')) {
- final opening = _countChars(url, '(');
- final closing = _countChars(url, ')');
+ final anchor = Element.text('a', text)
+ ..attributes['href'] = Uri.encodeFull(destination);
- if (closing > opening) {
- url = url.substring(0, url.length - 1);
- href = href.substring(0, href.length - 1);
- matchLength--;
- }
- }
+ parser
+ ..addNode(anchor)
+ ..consume(consumeLength);
- // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
- // not be considered part of the autolink, though they may be included
- // in the interior of the link:
- // https://github.github.com/gfm/#example-599
- final trailingPunc = regExpTrailingPunc.firstMatch(url);
- if (trailingPunc != null) {
- final trailingLength = trailingPunc.match.length;
- url = url.substring(0, url.length - trailingLength);
- href = href.substring(0, href.length - trailingLength);
- matchLength -= trailingLength;
- }
-
- // If an autolink ends in a semicolon (;), we check to see if it appears
- // to resemble an
- // [entity reference](https://github.github.com/gfm/#entity-references);
- // if the preceding text is & followed by one or more alphanumeric
- // characters. If so, it is excluded from the autolink:
- // https://github.github.com/gfm/#example-602
- if (url.endsWith(';')) {
- final entityRef = regExpEndsWithColon.firstMatch(url);
- if (entityRef != null) {
- // Strip out HTML entity reference
- final entityRefLength = entityRef.match.length;
- url = url.substring(0, url.length - entityRefLength);
- href = href.substring(0, href.length - entityRefLength);
- matchLength -= entityRefLength;
- }
- }
-
- // The scheme http will be inserted automatically
- if (!href.startsWith('http://') &&
- !href.startsWith('https://') &&
- !href.startsWith('ftp://')) {
- href = 'http://$href';
- }
-
- final text = parser.encodeHtml ? escapeHtml(url) : url;
- final anchor = Element.text('a', text);
- anchor.attributes['href'] = Uri.encodeFull(href);
- parser.addNode(anchor);
-
- parser.consume(matchLength);
- return false;
+ return true;
}
- int _countChars(String input, String char) {
- var count = 0;
+ int _getConsumeLength(String text) {
+ var excludedLength = 0;
- for (var i = 0; i < input.length; i++) {
- if (input[i] == char) count++;
+ // When an autolink ends in `)`, see
+ // https://github.github.com/gfm/#example-625.
+ if (text.endsWith(')')) {
+ final match = RegExp(r'(\(.*)?(\)+)$').firstMatch(text)!;
+
+ if (match[1] == null) {
+ excludedLength = match[2]!.length;
+ } else {
+ var parenCount = 0;
+ for (var i = 0; i < text.length; i++) {
+ final char = text.codeUnitAt(i);
+ if (char == $lparen) {
+ parenCount++;
+ } else if (char == $rparen) {
+ parenCount--;
+ }
+ }
+ if (parenCount < 0) {
+ excludedLength = parenCount.abs();
+ }
+ }
+ }
+ // If an autolink ends in a semicolon `;`, see
+ // https://github.github.com/gfm/#example-627
+ else if (text.endsWith(';')) {
+ final match = RegExp(r'&[0-9a-z]+;$').firstMatch(text);
+ if (match != null) {
+ excludedLength = match.match.length;
+ }
}
- return count;
+ return text.length - excludedLength;
}
}
diff --git a/test/gfm/autolinks.unit b/test/gfm/autolinks.unit
index d4c803b..4de9122 100644
--- a/test/gfm/autolinks.unit
+++ b/test/gfm/autolinks.unit
@@ -57,7 +57,7 @@
>>> Autolinks - 616
< http://foo.bar >
<<<
-<p>&lt; <a href="http://foo.bar">http://foo.bar</a> &gt;</p>
+<p>&lt; http://foo.bar &gt;</p>
>>> Autolinks - 617
<m:abc>
<<<
@@ -69,7 +69,7 @@
>>> Autolinks - 619
http://example.com
<<<
-<p><a href="http://example.com">http://example.com</a></p>
+<p>http://example.com</p>
>>> Autolinks - 620
foo@bar.example.com
<<<
diff --git a/test/gfm/autolinks_extension.unit b/test/gfm/autolinks_extension.unit
index ac41184..024fde1 100644
--- a/test/gfm/autolinks_extension.unit
+++ b/test/gfm/autolinks_extension.unit
@@ -23,9 +23,9 @@
(www.google.com/search?q=Markup+(business)
<<<
<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
-<p><a href="http://www.google.com/search?q=Markup+(business))">www.google.com/search?q=Markup+(business))</a>)</p>
<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
-<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
>>> Autolinks (extension) - 625
www.google.com/search?q=(business))+ok
<<<
@@ -35,7 +35,7 @@
www.google.com/search?q=commonmark&hl;
<<<
-<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&hl=en</a></p>
+<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&hl=en</a></p>
<p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p>
>>> Autolinks (extension) - 627
www.commonmark.org/he<lp
@@ -54,11 +54,11 @@
>>> Autolinks (extension) - 629
foo@bar.baz
<<<
-<p>foo@bar.baz</p>
+<p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p>
>>> Autolinks (extension) - 630
hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
<<<
-<p>hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.</p>
+<p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
>>> Autolinks (extension) - 631
a.b-c_d@a.b
@@ -68,7 +68,7 @@
a.b-c_d@a.b_
<<<
-<p>a.b-c_d@a.b</p>
-<p>a.b-c_d@a.b.</p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
<p>a.b-c_d@a.b-</p>
<p>a.b-c_d@a.b_</p>
diff --git a/test/markdown_test.dart b/test/markdown_test.dart
index 30142df..35fde7e 100644
--- a/test/markdown_test.dart
+++ b/test/markdown_test.dart
@@ -55,7 +55,7 @@
);
testDirectory('common_mark');
- testDirectory('gfm', extensionSet: ExtensionSet.gitHubFlavored);
+ testDirectory('gfm');
group('Corner cases', () {
validateCore('Incorrect Links', '''
diff --git a/test/util.dart b/test/util.dart
index b38b264..fba598d 100644
--- a/test/util.dart
+++ b/test/util.dart
@@ -14,11 +14,40 @@
for (final dataCase in dataCasesUnder(testDirectory: name)) {
final description =
'${dataCase.directory}/${dataCase.file}.unit ${dataCase.description}';
+
+ final inlineSyntaxes = <InlineSyntax>[];
+ final blockSyntaxes = <BlockSyntax>[];
+
+ if (dataCase.file.endsWith('_extension')) {
+ final extension = dataCase.file.substring(
+ 0,
+ dataCase.file.lastIndexOf('_extension'),
+ );
+ switch (extension) {
+ case 'autolinks':
+ inlineSyntaxes.add(AutolinkExtensionSyntax());
+ break;
+ case 'strikethrough':
+ inlineSyntaxes.add(StrikethroughSyntax());
+ break;
+ case 'tables':
+ blockSyntaxes.add(const TableSyntax());
+ break;
+ case 'disallowed_raw_html':
+ // TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
+ break;
+ default:
+ throw UnimplementedError('Unimplemented extension "$extension"');
+ }
+ }
+
validateCore(
description,
dataCase.input,
dataCase.expectedOutput,
extensionSet: extensionSet,
+ inlineSyntaxes: inlineSyntaxes,
+ blockSyntaxes: blockSyntaxes,
);
}
}
diff --git a/tool/gfm_stats.json b/tool/gfm_stats.json
index 9627e27..1460d3e 100644
--- a/tool/gfm_stats.json
+++ b/tool/gfm_stats.json
@@ -34,24 +34,24 @@
"613": "strict",
"614": "strict",
"615": "strict",
- "616": "fail",
+ "616": "strict",
"617": "strict",
"618": "strict",
- "619": "fail",
+ "619": "strict",
"620": "strict"
},
"Autolinks (extension)": {
"621": "strict",
"622": "strict",
"623": "strict",
- "624": "fail",
+ "624": "strict",
"625": "strict",
"626": "loose",
"627": "strict",
"628": "strict",
- "629": "fail",
- "630": "fail",
- "631": "fail"
+ "629": "strict",
+ "630": "strict",
+ "631": "strict"
},
"Backslash escapes": {
"308": "loose",
diff --git a/tool/gfm_stats.txt b/tool/gfm_stats.txt
index 8204a44..c455a49 100644
--- a/tool/gfm_stats.txt
+++ b/tool/gfm_stats.txt
@@ -1,6 +1,6 @@
17 of 18 – 94.4% ATX headings
- 17 of 19 – 89.5% Autolinks
- 7 of 11 – 63.6% Autolinks (extension)
+ 19 of 19 – 100.0% Autolinks
+ 11 of 11 – 100.0% Autolinks (extension)
12 of 13 – 92.3% Backslash escapes
1 of 1 – 100.0% Blank lines
23 of 25 – 92.0% Block quotes
@@ -28,5 +28,5 @@
11 of 11 – 100.0% Tabs
3 of 3 – 100.0% Textual content
19 of 19 – 100.0% Thematic breaks
- 634 of 671 – 94.5% TOTAL
- 569 of 634 – 89.7% TOTAL Strict
+ 640 of 671 – 95.4% TOTAL
+ 575 of 640 – 89.8% TOTAL Strict
diff --git a/tool/stats.dart b/tool/stats.dart
index 6fe481c..4f952bb 100644
--- a/tool/stats.dart
+++ b/tool/stats.dart
@@ -136,6 +136,7 @@
e,
verboseFail: verbose,
verboseLooseMatch: verboseLooseMatch,
+ extensions: e.extensions,
);
units.add(DataCase(
diff --git a/tool/stats_lib.dart b/tool/stats_lib.dart
index 125b34e..c1298f1 100644
--- a/tool/stats_lib.dart
+++ b/tool/stats_lib.dart
@@ -8,7 +8,14 @@
import 'package:html/dom.dart' show Element;
import 'package:html/parser.dart' show parseFragment;
-import 'package:markdown/markdown.dart' show markdownToHtml, ExtensionSet;
+import 'package:markdown/markdown.dart'
+ show
+ markdownToHtml,
+ InlineSyntax,
+ BlockSyntax,
+ AutolinkExtensionSyntax,
+ StrikethroughSyntax,
+ TableSyntax;
import 'package:path/path.dart' as p;
import '../test/util.dart';
@@ -54,19 +61,16 @@
static final Config commonMarkConfig = Config._(
'common_mark',
'http://spec.commonmark.org/0.28/',
- null,
);
static final Config gfmConfig = Config._(
'gfm',
'https://github.github.com/gfm/',
- ExtensionSet.gitHubFlavored,
);
final String prefix;
final String baseUrl;
- final ExtensionSet? extensionSet;
- Config._(this.prefix, this.baseUrl, this.extensionSet);
+ Config._(this.prefix, this.baseUrl);
}
class CommonMarkTestCase {
@@ -76,6 +80,7 @@
final String html;
final int startLine;
final int endLine;
+ final Set<String> extensions;
CommonMarkTestCase(
this.example,
@@ -84,6 +89,7 @@
this.endLine,
this.markdown,
this.html,
+ this.extensions,
);
factory CommonMarkTestCase.fromJson(Map<String, dynamic> json) {
@@ -94,6 +100,9 @@
json['end_line'] as int,
json['markdown'] as String /*!*/,
json['html'] as String,
+ json['extensions'] == null
+ ? const {}
+ : Set.from(json['extensions'] as List),
);
}
@@ -117,11 +126,37 @@
bool throwOnError = false,
bool verboseFail = false,
bool verboseLooseMatch = false,
+ Set<String> extensions = const {},
}) {
String output;
+ final inlineSyntaxes = <InlineSyntax>[];
+ final blockSyntaxes = <BlockSyntax>[];
+
+ for (final extension in extensions) {
+ switch (extension) {
+ case 'autolink':
+ inlineSyntaxes.add(AutolinkExtensionSyntax());
+ break;
+ case 'strikethrough':
+ inlineSyntaxes.add(StrikethroughSyntax());
+ break;
+ case 'table':
+ blockSyntaxes.add(const TableSyntax());
+ break;
+ case 'tagfilter':
+ // TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
+ break;
+ default:
+ throw UnimplementedError('Unimplemented extension "$extension"');
+ }
+ }
+
try {
- output =
- markdownToHtml(testCase.markdown, extensionSet: config.extensionSet);
+ output = markdownToHtml(
+ testCase.markdown,
+ inlineSyntaxes: inlineSyntaxes,
+ blockSyntaxes: blockSyntaxes,
+ );
} catch (err, stackTrace) {
if (throwOnError) {
rethrow;