expand scope of arxiv identifier matcher

The simple part of this is allowing 'arxiv:' in addition to 'arXiv:'. The more complex second part is to conservatively match "old" (pre-2008) style identifiers which do not have a prefix. The matching is conservative because, without the prefix, there is less confidence that a string is actually an arXiv identifier. Explicit collection prefixes are included (for those that existed pre-2008), internal whitespace is not allowed, and the identifier must be separated from any preceding alphabetic string (i.e. it must be preceded by a non-letter character).
kermitt2 · bnewbold · Nov 13, 2021 · Nov 13, 2021 · Nov 13, 2021 · Nov 13, 2021
commit 26204e884485f51a4f7e5e7808464dd9431acf9c
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java
@@ -58,8 +58,12 @@ public class TextUtilities {
 
     // a regular expression for arXiv identifiers
     // see https://arxiv.org/help/arxiv_identifier and https://arxiv.org/help/arxiv_identifier_for_services
+    // three pattern types are allowed, here are examples of each
+    //   "new style" with prefix: 'arXiv:0706.0002v3', 'arxiv: 0706.0002'
+    //   "old style" with prefix: 'arXiv : hep-th/9901001v2', 'arxiv:hep-th/ 9901001'
+    //   "old style" without prefix (strict: known pre-2008 archive name, no inner whitespace, and a leading non-letter character that is part of the match): ' hep-th/9901001v2', ' quant-ph/9901001'
     static public final Pattern arXivPattern = Pattern
-        .compile("(arXiv\\s?(\\.org)?\\s?\\:\\s?\\d{4}\\s?\\.\\s?\\d{4,5}(v\\d+)?)|(arXiv\\s?(\\.org)?\\s?\\:\\s?[ a-zA-Z\\-\\.]*\\s?/\\s?\\d{7}(v\\d+)?)");
+        .compile("(ar[xX]iv\\s?(\\.org)?\\s?\\:\\s?\\d{4}\\s?\\.\\s?\\d{4,5}(v\\d+)?)|(ar[xX]iv\\s?(\\.org)?\\s?\\:\\s?[ a-zA-Z\\-\\.]{3,16}\\s?/\\s?\\d{7}(v\\d+)?)|([^a-zA-Z](math|hep|astro|cond|gr|nucl|quant|stat|physics|cs|nlin|q\\-bio|q\\-fin)[a-zA-Z\\-\\.]*/\\d{7}(v\\d+)?)");
 
     // regular expression for PubMed identifiers, last group gives the PMID digits
     static public final Pattern pmidPattern = Pattern.compile("((PMID)|(Pub(\\s)?Med(\\s)?(ID)?))(\\s)?(\\:)?(\\s)*(\\d{1,8})");

diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java
@@ -345,6 +345,18 @@ public void testInArXivPatternLayoutToken2() {
         assertThat(positions.get(0).end, is(15));
     }
 
+    @Test
+    public void testInArXivPatternLayoutToken3() {
+        String piece = "K.R. Dienes, C. Kolda and J. March-Russell, hep-ph/9610479."; // old-style identifier with no 'arXiv:' prefix
+        List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece);
+        String text = LayoutTokensUtil.toText(tokens);
+        List<OffsetPosition> positions = target.tokenPositionsArXivPattern(tokens, text);
+
+        assertThat(positions, hasSize(1)); // only "hep-ph/9610479" is expected to be reported
+        assertThat(positions.get(0).start, is(22)); // NOTE(review): presumably a token index that includes the separator consumed by the pattern's leading [^a-zA-Z] -- confirm against tokenizer output
+        assertThat(positions.get(0).end, is(27)); // NOTE(review): presumably the index of the 7-digit token; the trailing '.' is not part of the match
+    }
+
     @Test
     public void testInIdentifierPatternLayoutToken() {
         String piece = "ATLAS collaboration, Measurements of the Nuclear Modification Factor for Jets in Pb+Pb Collisionsat √ "+
@@ -396,4 +408,4 @@ public void testInEmailPatternLayoutToken() {
         assertThat(positions.get(1).start, is(27));
         assertThat(positions.get(1).end, is(33));
     }
-}
+}