Refactor AutolinkExtensionSyntax (#471)

* Refactor AutolinkExtensionSyntax

Fix https://github.com/dart-lang/markdown/issues/470

* Improve readability

* optimise a bit

* Fix some requests from review

* Update gfm_stats.txt

* An optimisation.
diff --git a/lib/src/inline_syntaxes/autolink_extension_syntax.dart b/lib/src/inline_syntaxes/autolink_extension_syntax.dart
index ff2bc59..152b9a4 100644
--- a/lib/src/inline_syntaxes/autolink_extension_syntax.dart
+++ b/lib/src/inline_syntaxes/autolink_extension_syntax.dart
@@ -3,134 +3,128 @@
 // BSD-style license that can be found in the LICENSE file.
 
 import '../ast.dart';
+import '../charcode.dart';
 import '../inline_parser.dart';
 import '../util.dart';
 import 'inline_syntax.dart';
 
-/// Matches autolinks like `http://foo.com`.
+/// Matches autolinks like `http://foo.com` and `foo@example.com`.
 class AutolinkExtensionSyntax extends InlineSyntax {
-  /// Broken up parts of the autolink regex for reusability and readability
+  static const _linkPattern =
+      // Autolinks can only come at the beginning of a line, after whitespace,
+      // or any of the delimiting characters *, _, ~, (, and >.
+      r'(?<=^|[\s*_~(>])'
 
-  // Autolinks can only come at the beginning of a line, after whitespace, or
-  // any of the delimiting characters *, _, ~, and (.
-  static const start = r'(?:^|[\s*_~(>])';
+      // An extended url autolink is recognised when a scheme (http://,
+      // https://, or ftp://) or a `www.` prefix is followed by a valid domain.
+      // See https://github.github.com/gfm/#extended-url-autolink.
+      r'(?:(?:https?|ftp):\/\/|www\.)'
 
-  // An extended url autolink will be recognized when one of the schemes
-  // http://, https://, or ftp://, followed by a valid domain
-  static const scheme = r'(?:(?:https?|ftp):\/\/|www\.)';
+      // A valid domain consists of segments of alphanumeric characters,
+      // underscores (_) and hyphens (-) separated by periods (.). There must
+      // be at least one period, and no underscores may be present in the last
+      // two segments of the domain. See
+      // https://github.github.com/gfm/#valid-domain.
+      r'(?:[-_a-z0-9]+\.)*(?:[-a-z0-9]+\.[-a-z0-9]+)'
 
-  // A valid domain consists of alphanumeric characters, underscores (_),
-  // hyphens (-) and periods (.). There must be at least one period, and no
-  // underscores may be present in the last two segments of the domain.
-  static const domainPart = r'\w\-';
-  static const domain = '[$domainPart][$domainPart.]+';
+      // After a valid domain, zero or more non-space non-< characters may
+      // follow.
+      r'[^\s<]*'
 
-  // A valid domain consists of alphanumeric characters, underscores (_),
-  // hyphens (-) and periods (.).
-  static const path = r'[^\s<]*';
+      // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
+      // not be considered part of the autolink, though they may be included in
+      // the interior of the link. See
+      // https://github.github.com/gfm/#extended-autolink-path-validation.
+      '(?<![?!.,:*_~])';
 
-  // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will not
-  // be considered part of the autolink
-  static const truncatingPunctuationPositive = '[?!.,:*_~]';
+  // An extended email autolink, see
+  // https://github.github.com/gfm/#extended-email-autolink.
+  static const _emailPattern =
+      r'[-_.+a-z0-9]+@(?:[-_a-z0-9]+\.)+[-_a-z0-9]*[a-z0-9](?![-_])';
 
-  static final regExpTrailingPunc = RegExp('$truncatingPunctuationPositive*\$');
-  static final regExpEndsWithColon = RegExp(r'\&[a-zA-Z0-9]+;$');
-  static final regExpWhiteSpace = RegExp(r'\s');
-
-  AutolinkExtensionSyntax() : super('$start(($scheme)($domain)($path))');
+  AutolinkExtensionSyntax()
+      : super(
+          '($_linkPattern)|($_emailPattern)',
+          caseSensitive: false,
+        );
 
   @override
   bool tryMatch(InlineParser parser, [int? startMatchPos]) {
-    return super.tryMatch(parser, parser.pos > 0 ? parser.pos - 1 : 0);
+    startMatchPos ??= parser.pos;
+    final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
+    if (startMatch == null) {
+      return false;
+    }
+    parser.writeText();
+    return onMatch(parser, startMatch);
   }
 
   @override
   bool onMatch(InlineParser parser, Match match) {
-    var url = match[1]!;
-    var href = url;
-    var matchLength = url.length;
+    int consumeLength;
 
-    if (url[0] == '>' || url.startsWith(regExpWhiteSpace)) {
-      url = url.substring(1, url.length - 1);
-      href = href.substring(1, href.length - 1);
-      parser.pos++;
-      matchLength--;
+    final isEmailLink = match[2] != null;
+    if (isEmailLink) {
+      consumeLength = match.match.length;
+    } else {
+      consumeLength = _getConsumeLength(match.match);
     }
 
-    // Prevent accidental standard autolink matches
-    if (url.endsWith('>') && parser.source[parser.pos - 1] == '<') {
-      return false;
+    var text = match.match.substring(0, consumeLength);
+    text = parser.encodeHtml ? escapeHtml(text) : text;
+
+    var destination = text;
+    if (isEmailLink) {
+      destination = 'mailto:$destination';
+    } else if (destination[0] == 'w') {
+      // When there is no scheme specified, insert the scheme `http`.
+      destination = 'http://$destination';
     }
 
-    // When an autolink ends in ), we scan the entire autolink for the total
-    // number of parentheses. If there is a greater number of closing
-    // parentheses than opening ones, we don’t consider the last character
-    // part of the autolink, in order to facilitate including an autolink
-    // inside a parenthesis:
-    // https://github.github.com/gfm/#example-600
-    if (url.endsWith(')')) {
-      final opening = _countChars(url, '(');
-      final closing = _countChars(url, ')');
+    final anchor = Element.text('a', text)
+      ..attributes['href'] = Uri.encodeFull(destination);
 
-      if (closing > opening) {
-        url = url.substring(0, url.length - 1);
-        href = href.substring(0, href.length - 1);
-        matchLength--;
-      }
-    }
+    parser
+      ..addNode(anchor)
+      ..consume(consumeLength);
 
-    // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
-    // not be considered part of the autolink, though they may be included
-    // in the interior of the link:
-    // https://github.github.com/gfm/#example-599
-    final trailingPunc = regExpTrailingPunc.firstMatch(url);
-    if (trailingPunc != null) {
-      final trailingLength = trailingPunc.match.length;
-      url = url.substring(0, url.length - trailingLength);
-      href = href.substring(0, href.length - trailingLength);
-      matchLength -= trailingLength;
-    }
-
-    // If an autolink ends in a semicolon (;), we check to see if it appears
-    // to resemble an
-    // [entity reference](https://github.github.com/gfm/#entity-references);
-    // if the preceding text is & followed by one or more alphanumeric
-    // characters. If so, it is excluded from the autolink:
-    // https://github.github.com/gfm/#example-602
-    if (url.endsWith(';')) {
-      final entityRef = regExpEndsWithColon.firstMatch(url);
-      if (entityRef != null) {
-        // Strip out HTML entity reference
-        final entityRefLength = entityRef.match.length;
-        url = url.substring(0, url.length - entityRefLength);
-        href = href.substring(0, href.length - entityRefLength);
-        matchLength -= entityRefLength;
-      }
-    }
-
-    // The scheme http will be inserted automatically
-    if (!href.startsWith('http://') &&
-        !href.startsWith('https://') &&
-        !href.startsWith('ftp://')) {
-      href = 'http://$href';
-    }
-
-    final text = parser.encodeHtml ? escapeHtml(url) : url;
-    final anchor = Element.text('a', text);
-    anchor.attributes['href'] = Uri.encodeFull(href);
-    parser.addNode(anchor);
-
-    parser.consume(matchLength);
-    return false;
+    return true;
   }
 
-  int _countChars(String input, String char) {
-    var count = 0;
+  int _getConsumeLength(String text) {
+    var excludedLength = 0;
 
-    for (var i = 0; i < input.length; i++) {
-      if (input[i] == char) count++;
+    // When an autolink ends in `)`, see
+    // https://github.github.com/gfm/#example-625.
+    if (text.endsWith(')')) {
+      final match = RegExp(r'(\(.*)?(\)+)$').firstMatch(text)!;
+
+      if (match[1] == null) {
+        excludedLength = match[2]!.length;
+      } else {
+        var parenCount = 0;
+        for (var i = 0; i < text.length; i++) {
+          final char = text.codeUnitAt(i);
+          if (char == $lparen) {
+            parenCount++;
+          } else if (char == $rparen) {
+            parenCount--;
+          }
+        }
+        if (parenCount < 0) {
+          excludedLength = parenCount.abs();
+        }
+      }
+    }
+    // If an autolink ends in a semicolon `;`, see
+    // https://github.github.com/gfm/#example-627
+    else if (text.endsWith(';')) {
+      final match = RegExp(r'&[0-9a-z]+;$').firstMatch(text);
+      if (match != null) {
+        excludedLength = match.match.length;
+      }
     }
 
-    return count;
+    return text.length - excludedLength;
   }
 }
diff --git a/test/gfm/autolinks.unit b/test/gfm/autolinks.unit
index d4c803b..4de9122 100644
--- a/test/gfm/autolinks.unit
+++ b/test/gfm/autolinks.unit
@@ -57,7 +57,7 @@
 >>> Autolinks - 616
 < http://foo.bar >
 <<<
-<p>&lt; <a href="http://foo.bar">http://foo.bar</a> &gt;</p>
+<p>&lt; http://foo.bar &gt;</p>
 >>> Autolinks - 617
 <m:abc>
 <<<
@@ -69,7 +69,7 @@
 >>> Autolinks - 619
 http://example.com
 <<<
-<p><a href="http://example.com">http://example.com</a></p>
+<p>http://example.com</p>
 >>> Autolinks - 620
 foo@bar.example.com
 <<<
diff --git a/test/gfm/autolinks_extension.unit b/test/gfm/autolinks_extension.unit
index ac41184..024fde1 100644
--- a/test/gfm/autolinks_extension.unit
+++ b/test/gfm/autolinks_extension.unit
@@ -23,9 +23,9 @@
 (www.google.com/search?q=Markup+(business)
 <<<
 <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
-<p><a href="http://www.google.com/search?q=Markup+(business))">www.google.com/search?q=Markup+(business))</a>)</p>
 <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
-<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
 >>> Autolinks (extension) - 625
 www.google.com/search?q=(business))+ok
 <<<
@@ -35,7 +35,7 @@
 
 www.google.com/search?q=commonmark&hl;
 <<<
-<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
+<p><a href="http://www.google.com/search?q=commonmark&amp;hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
 <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p>
 >>> Autolinks (extension) - 627
 www.commonmark.org/he<lp
@@ -54,11 +54,11 @@
 >>> Autolinks (extension) - 629
 foo@bar.baz
 <<<
-<p>foo@bar.baz</p>
+<p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p>
 >>> Autolinks (extension) - 630
 hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
 <<<
-<p>hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.</p>
+<p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
 >>> Autolinks (extension) - 631
 a.b-c_d@a.b
 
@@ -68,7 +68,7 @@
 
 a.b-c_d@a.b_
 <<<
-<p>a.b-c_d@a.b</p>
-<p>a.b-c_d@a.b.</p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
 <p>a.b-c_d@a.b-</p>
 <p>a.b-c_d@a.b_</p>
diff --git a/test/markdown_test.dart b/test/markdown_test.dart
index 30142df..35fde7e 100644
--- a/test/markdown_test.dart
+++ b/test/markdown_test.dart
@@ -55,7 +55,7 @@
   );
 
   testDirectory('common_mark');
-  testDirectory('gfm', extensionSet: ExtensionSet.gitHubFlavored);
+  testDirectory('gfm');
 
   group('Corner cases', () {
     validateCore('Incorrect Links', '''
diff --git a/test/util.dart b/test/util.dart
index b38b264..fba598d 100644
--- a/test/util.dart
+++ b/test/util.dart
@@ -14,11 +14,40 @@
   for (final dataCase in dataCasesUnder(testDirectory: name)) {
     final description =
         '${dataCase.directory}/${dataCase.file}.unit ${dataCase.description}';
+
+    final inlineSyntaxes = <InlineSyntax>[];
+    final blockSyntaxes = <BlockSyntax>[];
+
+    if (dataCase.file.endsWith('_extension')) {
+      final extension = dataCase.file.substring(
+        0,
+        dataCase.file.lastIndexOf('_extension'),
+      );
+      switch (extension) {
+        case 'autolinks':
+          inlineSyntaxes.add(AutolinkExtensionSyntax());
+          break;
+        case 'strikethrough':
+          inlineSyntaxes.add(StrikethroughSyntax());
+          break;
+        case 'tables':
+          blockSyntaxes.add(const TableSyntax());
+          break;
+        case 'disallowed_raw_html':
+          // TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
+          break;
+        default:
+          throw UnimplementedError('Unimplemented extension "$extension"');
+      }
+    }
+
     validateCore(
       description,
       dataCase.input,
       dataCase.expectedOutput,
       extensionSet: extensionSet,
+      inlineSyntaxes: inlineSyntaxes,
+      blockSyntaxes: blockSyntaxes,
     );
   }
 }
diff --git a/tool/gfm_stats.json b/tool/gfm_stats.json
index 9627e27..1460d3e 100644
--- a/tool/gfm_stats.json
+++ b/tool/gfm_stats.json
@@ -34,24 +34,24 @@
   "613": "strict",
   "614": "strict",
   "615": "strict",
-  "616": "fail",
+  "616": "strict",
   "617": "strict",
   "618": "strict",
-  "619": "fail",
+  "619": "strict",
   "620": "strict"
  },
  "Autolinks (extension)": {
   "621": "strict",
   "622": "strict",
   "623": "strict",
-  "624": "fail",
+  "624": "strict",
   "625": "strict",
   "626": "loose",
   "627": "strict",
   "628": "strict",
-  "629": "fail",
-  "630": "fail",
-  "631": "fail"
+  "629": "strict",
+  "630": "strict",
+  "631": "strict"
  },
  "Backslash escapes": {
   "308": "loose",
diff --git a/tool/gfm_stats.txt b/tool/gfm_stats.txt
index 8204a44..c455a49 100644
--- a/tool/gfm_stats.txt
+++ b/tool/gfm_stats.txt
@@ -1,6 +1,6 @@
   17 of   18 –  94.4%  ATX headings
-  17 of   19 –  89.5%  Autolinks
-   7 of   11 –  63.6%  Autolinks (extension)
+  19 of   19 – 100.0%  Autolinks
+  11 of   11 – 100.0%  Autolinks (extension)
   12 of   13 –  92.3%  Backslash escapes
    1 of    1 – 100.0%  Blank lines
   23 of   25 –  92.0%  Block quotes
@@ -28,5 +28,5 @@
   11 of   11 – 100.0%  Tabs
    3 of    3 – 100.0%  Textual content
   19 of   19 – 100.0%  Thematic breaks
- 634 of  671 –  94.5%  TOTAL
- 569 of  634 –  89.7%  TOTAL Strict
+ 640 of  671 –  95.4%  TOTAL
+ 575 of  640 –  89.8%  TOTAL Strict
diff --git a/tool/stats.dart b/tool/stats.dart
index 6fe481c..4f952bb 100644
--- a/tool/stats.dart
+++ b/tool/stats.dart
@@ -136,6 +136,7 @@
         e,
         verboseFail: verbose,
         verboseLooseMatch: verboseLooseMatch,
+        extensions: e.extensions,
       );
 
       units.add(DataCase(
diff --git a/tool/stats_lib.dart b/tool/stats_lib.dart
index 125b34e..c1298f1 100644
--- a/tool/stats_lib.dart
+++ b/tool/stats_lib.dart
@@ -8,7 +8,14 @@
 
 import 'package:html/dom.dart' show Element;
 import 'package:html/parser.dart' show parseFragment;
-import 'package:markdown/markdown.dart' show markdownToHtml, ExtensionSet;
+import 'package:markdown/markdown.dart'
+    show
+        markdownToHtml,
+        InlineSyntax,
+        BlockSyntax,
+        AutolinkExtensionSyntax,
+        StrikethroughSyntax,
+        TableSyntax;
 import 'package:path/path.dart' as p;
 
 import '../test/util.dart';
@@ -54,19 +61,16 @@
   static final Config commonMarkConfig = Config._(
     'common_mark',
     'http://spec.commonmark.org/0.28/',
-    null,
   );
   static final Config gfmConfig = Config._(
     'gfm',
     'https://github.github.com/gfm/',
-    ExtensionSet.gitHubFlavored,
   );
 
   final String prefix;
   final String baseUrl;
-  final ExtensionSet? extensionSet;
 
-  Config._(this.prefix, this.baseUrl, this.extensionSet);
+  Config._(this.prefix, this.baseUrl);
 }
 
 class CommonMarkTestCase {
@@ -76,6 +80,7 @@
   final String html;
   final int startLine;
   final int endLine;
+  final Set<String> extensions;
 
   CommonMarkTestCase(
     this.example,
@@ -84,6 +89,7 @@
     this.endLine,
     this.markdown,
     this.html,
+    this.extensions,
   );
 
   factory CommonMarkTestCase.fromJson(Map<String, dynamic> json) {
@@ -94,6 +100,9 @@
       json['end_line'] as int,
       json['markdown'] as String /*!*/,
       json['html'] as String,
+      json['extensions'] == null
+          ? const {}
+          : Set.from(json['extensions'] as List),
     );
   }
 
@@ -117,11 +126,37 @@
   bool throwOnError = false,
   bool verboseFail = false,
   bool verboseLooseMatch = false,
+  Set<String> extensions = const {},
 }) {
   String output;
+  final inlineSyntaxes = <InlineSyntax>[];
+  final blockSyntaxes = <BlockSyntax>[];
+
+  for (final extension in extensions) {
+    switch (extension) {
+      case 'autolink':
+        inlineSyntaxes.add(AutolinkExtensionSyntax());
+        break;
+      case 'strikethrough':
+        inlineSyntaxes.add(StrikethroughSyntax());
+        break;
+      case 'table':
+        blockSyntaxes.add(const TableSyntax());
+        break;
+      case 'tagfilter':
+        // TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
+        break;
+      default:
+        throw UnimplementedError('Unimplemented extension "$extension"');
+    }
+  }
+
   try {
-    output =
-        markdownToHtml(testCase.markdown, extensionSet: config.extensionSet);
+    output = markdownToHtml(
+      testCase.markdown,
+      inlineSyntaxes: inlineSyntaxes,
+      blockSyntaxes: blockSyntaxes,
+    );
   } catch (err, stackTrace) {
     if (throwOnError) {
       rethrow;