From f7908c4e8421317b80ee01147a69348e810b4971 Mon Sep 17 00:00:00 2001 From: Yelisey Romanov Date: Fri, 18 Apr 2025 15:13:36 -0500 Subject: [PATCH] Add support for OCaml. fixes #4407 --- .../indexer/analysis/AnalyzerGuru.java | 2 + .../indexer/analysis/ocaml/Consts.java | 119 +++++++++ .../indexer/analysis/ocaml/OCamlAnalyzer.java | 75 ++++++ .../analysis/ocaml/OCamlAnalyzerFactory.java | 55 ++++ .../indexer/analysis/ocaml/OCamlLexer.java | 58 +++++ .../src/main/jflex/analysis/ocaml/OCaml.lexh | 131 ++++++++++ .../analysis/ocaml/OCamlSymbolTokenizer.lex | 125 +++++++++ .../main/jflex/analysis/ocaml/OCamlXref.lex | 243 ++++++++++++++++++ .../ocaml/OCamlSymbolTokenizerTest.java | 90 +++++++ .../indexer/analysis/ocaml/OCamlXrefTest.java | 179 +++++++++++++ .../test/resources/analysis/ocaml/sample.ml | 30 +++ .../test/resources/analysis/ocaml/sample2.ml | 20 ++ .../analysis/ocaml/sample2_xref.html | 24 ++ .../resources/analysis/ocaml/sample_xref.html | 34 +++ .../test/resources/analysis/ocaml/sampletags | 1 + 15 files changed, 1186 insertions(+) create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex create mode 100644 opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlSymbolTokenizerTest.java create mode 100644 opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlXrefTest.java create mode 100644 
opengrok-indexer/src/test/resources/analysis/ocaml/sample.ml create mode 100644 opengrok-indexer/src/test/resources/analysis/ocaml/sample2.ml create mode 100644 opengrok-indexer/src/test/resources/analysis/ocaml/sample2_xref.html create mode 100644 opengrok-indexer/src/test/resources/analysis/ocaml/sample_xref.html create mode 100644 opengrok-indexer/src/test/resources/analysis/ocaml/sampletags diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index a8396f7fe92..49457fd16ef 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -89,6 +89,7 @@ import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory; import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory; import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory; +import org.opengrok.indexer.analysis.ocaml.OCamlAnalyzerFactory; import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory; import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory; import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory; @@ -298,6 +299,7 @@ public class AnalyzerGuru { new HaskellAnalyzerFactory(), new GolangAnalyzerFactory(), new LuaAnalyzerFactory(), + new OCamlAnalyzerFactory(), new PascalAnalyzerFactory(), new AdaAnalyzerFactory(), new RubyAnalyzerFactory(), diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java new file mode 100644 index 00000000000..905ee471688 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ +package org.opengrok.indexer.analysis.ocaml; + +import java.util.HashSet; +import java.util.Set; + +/** + * Represents a container for a set of OCaml keywords. + */ +public class Consts { + + static final Set kwd = new HashSet<>(); + + static { + /* OCaml 5.3.0 keywords */ + kwd.add("and"); + kwd.add("as"); + kwd.add("assert"); + kwd.add("begin"); + kwd.add("class"); + kwd.add("constraint"); + kwd.add("do"); + kwd.add("done"); + kwd.add("downto"); + kwd.add("effect"); + kwd.add("else"); + kwd.add("end"); + kwd.add("exception"); + kwd.add("external"); + kwd.add("false"); + kwd.add("for"); + kwd.add("fun"); + kwd.add("function"); + kwd.add("functor"); + kwd.add("if"); + kwd.add("in"); + kwd.add("include"); + kwd.add("inherit"); + kwd.add("initializer"); + kwd.add("lazy"); + kwd.add("let"); + kwd.add("match"); + kwd.add("method"); + kwd.add("module"); + kwd.add("mutable"); + kwd.add("new"); + kwd.add("nonrec"); + kwd.add("object"); + kwd.add("of"); + kwd.add("open"); + kwd.add("or"); + kwd.add("parser"); + kwd.add("private"); + kwd.add("ref"); + kwd.add("rec"); + kwd.add("sig"); + kwd.add("struct"); + kwd.add("then"); + kwd.add("to"); + kwd.add("true"); + kwd.add("try"); + kwd.add("type"); + kwd.add("val"); + kwd.add("virtual"); + kwd.add("when"); + kwd.add("while"); + kwd.add("with"); + kwd.add("lor"); + kwd.add("lxor"); + kwd.add("mod"); + 
kwd.add("land"); + kwd.add("lsl"); + kwd.add("lsr"); + kwd.add("asr"); + + /* OCaml 5.3.0 predefined types */ + /* it is possible to make a variable of such a name, + though people mostly do not use this opportunity */ + kwd.add("bool"); + kwd.add("char"); + kwd.add("float"); + kwd.add("int"); + + kwd.add("bytes"); + kwd.add("string"); + + kwd.add("array"); + kwd.add("list"); + kwd.add("option"); + /* "result" is often a variable, so not adding */ + + kwd.add("unit"); + } + + /** Private to enforce static. */ + private Consts() { + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java new file mode 100644 index 00000000000..4f6f52df0d1 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . 
+ */ +package org.opengrok.indexer.analysis.ocaml; + +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.FileAnalyzerFactory; +import org.opengrok.indexer.analysis.JFlexTokenizer; +import org.opengrok.indexer.analysis.JFlexXref; +import org.opengrok.indexer.analysis.plain.AbstractSourceCodeAnalyzer; + +import java.io.Reader; + +/** + * Represents an analyzer for the OCaml language. + */ +@SuppressWarnings("java:S110") +public class OCamlAnalyzer extends AbstractSourceCodeAnalyzer { + + /** + * Creates a new instance of {@link OCamlAnalyzer}. + * @param factory instance + */ + protected OCamlAnalyzer(FileAnalyzerFactory factory) { + super(factory, () -> new JFlexTokenizer(new OCamlSymbolTokenizer( + AbstractAnalyzer.DUMMY_READER))); + } + + /** + * @return {@code "ocaml"} + */ + @Override + public String getCtagsLang() { + return "ocaml"; + } + + /** + * Gets a version number to be used to tag processed documents so that + * re-analysis can be re-done later if a stored version number is different + * from the current implementation. + * @return 20250403_00 + */ + @Override + protected int getSpecializedVersionNo() { + return 20250403_00; // Edit comment above too! + } + + /** + * Creates a wrapped {@link OCamlXref} instance. + * @return a defined instance + */ + @Override + protected JFlexXref newXref(Reader reader) { + return new JFlexXref(new OCamlXref(reader)); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java new file mode 100644 index 00000000000..7025ed30037 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ +package org.opengrok.indexer.analysis.ocaml; + +import org.opengrok.indexer.analysis.AbstractAnalyzer.Genre; +import org.opengrok.indexer.analysis.FileAnalyzer; +import org.opengrok.indexer.analysis.FileAnalyzerFactory; + +/** + * Represents a factory to create {@link OCamlAnalyzer} instances. + */ +public class OCamlAnalyzerFactory extends FileAnalyzerFactory { + + private static final String NAME = "OCaml"; + + private static final String[] SUFFIXES = {"ML", "MLI"}; + + /** + * Initializes a factory instance to associate the file extensions ".ml", + * ".mli" with {@link OCamlAnalyzer}. + */ + public OCamlAnalyzerFactory() { + super(null, null, SUFFIXES, null, null, "text/plain", Genre.PLAIN, + NAME, true); + } + + /** + * Creates a new {@link OCamlAnalyzer} instance. 
+ * @return a defined instance + */ + @Override + protected FileAnalyzer newAnalyzer() { + return new OCamlAnalyzer(this); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java new file mode 100644 index 00000000000..6256029ff00 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ +package org.opengrok.indexer.analysis.ocaml; + +import org.opengrok.indexer.analysis.JFlexJointLexer; +import org.opengrok.indexer.analysis.JFlexSymbolMatcher; +import org.opengrok.indexer.analysis.Resettable; + +/** + * Represents an abstract base class for OCaml lexers. + */ +@SuppressWarnings("Duplicates") +abstract class OCamlLexer extends JFlexSymbolMatcher + implements JFlexJointLexer, Resettable { + + /** + * Calls {@link #phLOC()} if the yystate is not COMMENT or SCOMMENT. + */ + public void chkLOC() { + if (yystate() != COMMENT() && yystate() != SCOMMENT()) { + phLOC(); + } + } + + /** + * Subclasses must override to get the constant value created by JFlex to + * represent COMMENT. 
+ */ + @SuppressWarnings("java:S100") + abstract int COMMENT(); + + /** + * Subclasses must override to get the constant value created by JFlex to + * represent SCOMMENT. + */ + @SuppressWarnings("java:S100") + abstract int SCOMMENT(); +} diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh new file mode 100644 index 00000000000..475b3adf5bb --- /dev/null +++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh @@ -0,0 +1,131 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ + +Identifier = ({varid} | {conid} | {pvconid} | {typevarid}) +/* + * varid → (small {small | large | digit | ' })⟨reservedid⟩ + * ; N.b. 
"except {reservedid} is excluded from OpenGrok's varid definition + */ +varid = {small} ({small} | {large} | {digit} | [\'])* +/* + * conid → large {small | large | digit | ' } + */ +conid = {large} ({small} | {large} | {digit} | [\'])* +/* + * polymorphic variant + * pvconid → `large {small | large | digit | ' } + */ +pvconid = [\`] {large} ({small} | {large} | {digit} | [\'])* +/* + * type variable + * typevarid → 'small {small | large | digit } + */ +typevarid = [\'] {small} ({small} | {large} | {digit})* +/* + * small → ascSmall | uniSmall | _ + * ascSmall → a | b | … | z + */ +small = [a-z_] +/* + * large → ascLarge | uniLarge + * ascLarge → A | B | … | Z + */ +large = [A-Z] +/* + * digit → ascDigit | uniDigit + * ascDigit → 0 | 1 | … | 9 + * uniDigit → any Unicode decimal digit + * octit → 0 | 1 | … | 7 + * hexit → digit | A | … | F | a | … | f + */ +digit = [0-9] +octit = [0-7] +hexit = [0-9A-Fa-f] +binit = [0-1] + +Number = ({integer} | {float}) +/* + * decimal → digit{digit} + * octal → octit{octit} + * hexadecimal → hexit{hexit} + */ +decimal = {digit}({digit} | _)* +octal = {octit}({octit} | _)* +hexadecimal = {hexit}({hexit} | _)* +binary = {binit}({binit} | _)* + +/* + * integer → decimal + * | 0o octal | 0O octal + * | 0x hexadecimal | 0X hexadecimal + * | 0b binary | 0B binary + */ +integer = ({decimal} | [0][oO]{octal} | [0][xX]{hexadecimal} | [0][bB]{binary} ) ( l | L | n)? + +/* + * float → decimal . decimal [exponent] + * | decimal exponent + */ +float = ({decimal} [\.] {decimal} {exponent}? | + {decimal} {exponent}) + +/* + * exponent → (e | E) [+ | -] decimal + */ +exponent = [eE] [\+\-]? 
{decimal} + +/* + * Special treatment of chars is due to type variables with quote + * + * char literal → '\n' | '[^ '\\' '\'' '\010' '\013']' + | escaped_char | dec_code | oct_code | hex_code + */ +Character = ( {newline_char} | {regular_char} | {escaped_char} | + {deccode_char} | {octcode_char} | {hexcode_char}) + +newline_char = \' \n \' +regular_char = \' [^ \\ \' '\010' '\013'] \' +escaped_char = \' \\ [\\ \' \" n t b r ' '] \' +deccode_char = \' \\ {digit}{digit}{digit} \' +octcode_char = \' \\ o {octit}{octit}{octit} \' +hexcode_char = \' \\ x {hexit}{hexit} \' + +/* + * Extension → %attrid | %%attrid | @attrid + */ +lowercase = {varid} +uppercase = {conid} + +attrid = ({lowercase} | {uppercase}) ( [\.] | {lowercase} | {uppercase})* + +Extension = \[ [ ]* @{attrid} | \[ [ ]* @@ {attrid} | \[ [ ]* @@@ {attrid} | + \% {attrid} | \%\% {attrid} + +QuotedStringBegin = \{ {lowercase}* \| +QuotedStringEnd = \| {lowercase}* \} + +/* + * Syntax sugar for extension nodes with quoted strings. + */ +QuotedExtensionBegin = \{ (\% {attrid} | \%\% {attrid}) [ ]* +QuotedExtensionKey = {lowercase}* \| diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex new file mode 100644 index 00000000000..0b3879799f3 --- /dev/null +++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ + +/* + * Get OCaml symbols + */ + +package org.opengrok.indexer.analysis.ocaml; + +import java.io.IOException; +import org.opengrok.indexer.analysis.JFlexSymbolMatcher; + +/** + * @author Yelisey Romanov. Based on work of Harry Pan + */ +%% +%public +%class OCamlSymbolTokenizer +%extends JFlexSymbolMatcher +%unicode +%int +%include ../CommonLexer.lexh +%char +%{ + private int nestedComment; + private String quotedStringKey; + + public void reset() { + super.reset(); + nestedComment = 0; + quotedStringKey = ""; + } +%} + +%state STRING QSTRING QEXTENSIONBEGIN BCOMMENT + +%include ../Common.lexh +%include OCaml.lexh +%% + + { + {Character} {} + {Identifier} { + String id = yytext(); + if (!Consts.kwd.contains(id)) { + onSymbolMatched(id, yychar); + return yystate(); + } + } + {Extension} {} + {Number} {} + \" { yybegin(STRING); } + {QuotedStringBegin} { + String key = yytext(); + quotedStringKey = key.substring(1, key.length() - 1); + yybegin(QSTRING); + } + {QuotedExtensionBegin} { + yypush(QEXTENSIONBEGIN); + } +} + + { + \\[\"\\] {} + \" { yybegin(YYINITIAL); } +} + + { + {QuotedStringEnd} { + String key = yytext(); + if (quotedStringKey.equals( + key.substring(1, key.length() - 1))) { + quotedStringKey = ""; + yybegin(YYINITIAL); + } + } +} + + { + {QuotedExtensionKey} { + String key = yytext(); + quotedStringKey = key.substring(0, key.length() - 1); + yybegin(QSTRING); + } +} + + { + "(*" { + if (nestedComment++ == 0) { + yybegin(BCOMMENT); + } + } +} + + { + "*)" { + if (--nestedComment == 0) { + yybegin(YYINITIAL); + } + } +} + +// fallback +{WhspChar}+ | +[^] {} diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex 
b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex new file mode 100644 index 00000000000..b3d2159f0a9 --- /dev/null +++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex @@ -0,0 +1,243 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ + +/* + * Cross reference an OCaml file + */ + +package org.opengrok.indexer.analysis.ocaml; + +import java.io.IOException; +import org.opengrok.indexer.analysis.JFlexSymbolMatcher; +import org.opengrok.indexer.web.HtmlConsts; + +/** + * @author Yelisey Romanov + */ +%% +%public +%class OCamlXref +%extends JFlexSymbolMatcher +%unicode +%int +%char +%include ../CommonLexer.lexh +%include ../CommonXref.lexh +%{ + private int nestedComment; + private String quotedStringKey; + + @Override + public void reset() { + super.reset(); + nestedComment = 0; + quotedStringKey = ""; + } + + @Override + public void yypop() throws IOException { + onDisjointSpanChanged(null, yychar); + super.yypop(); + } + + protected void chkLOC() { + switch (yystate()) { + case BCOMMENT: + break; + default: + phLOC(); + break; + } + } +%} + +%state STRING QSTRING QEXTENSIONBEGIN BCOMMENT + +%include ../Common.lexh +%include ../CommonURI.lexh +%include ../CommonPath.lexh +%include OCaml.lexh +%% + { + {Character} { + 
chkLOC(); + onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); + onNonSymbolMatched(yytext(), yychar); + onDisjointSpanChanged(null, yychar); + } + {Identifier} { + chkLOC(); + String id = yytext(); + onFilteredSymbolMatched(id, yychar, Consts.kwd); + } + {Extension} { + chkLOC(); + onDisjointSpanChanged(HtmlConsts.MACRO_CLASS, yychar); + onNonSymbolMatched(yytext(), yychar); + onDisjointSpanChanged(null, yychar); + } + {Number} { + chkLOC(); + onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar); + onNonSymbolMatched(yytext(), yychar); + onDisjointSpanChanged(null, yychar); + } +} + + { + \\[\"\\] { chkLOC(); onNonSymbolMatched(yytext(), yychar); } + \" { + chkLOC(); + onNonSymbolMatched(yytext(), yychar); + yypop(); + if (nestedComment > 0) { + onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar); + } + } + /* + * "A string may include a 'gap'-—two backslants enclosing white + * characters—-which is ignored. This allows one to write long strings on + * more than one line by writing a backslant at the end of one line and at + * the start of the next." N.b. OpenGrok does not explicitly recognize the + * "gap" but since a STRING must end in a non-escaped quotation mark, just + * allow STRINGs to be multi-line regardless of syntax. 
+ */ +} + + { + {QuotedExtensionKey} { + chkLOC(); + yypop(); + yypush(QSTRING); + if (nestedComment > 0) { + onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar); + } else { + onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); + } + onNonSymbolMatched(yytext(), yychar); + + String key = yytext(); + quotedStringKey = key.substring(0, key.length() - 1); + } +} + + { + {QuotedStringEnd} { + String key = yytext(); + if (quotedStringKey.equals( + key.substring(1, key.length() - 1))) { + quotedStringKey = ""; + chkLOC(); + onNonSymbolMatched(yytext(), yychar); + yypop(); + if (nestedComment > 0) { + onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar); + } + } else { + chkLOC(); + onNonSymbolMatched(yytext(), yychar); + } + } + /* + * "A string may include a 'gap'-—two backslants enclosing white + * characters—-which is ignored. This allows one to write long strings on + * more than one line by writing a backslant at the end of one line and at + * the start of the next." N.b. OpenGrok does not explicitly recognize the + * "gap" but since a STRING must end in a non-escaped quotation mark, just + * allow STRINGs to be multi-line regardless of syntax. 
+ */ +} + + { + "(*" { + if (nestedComment++ == 0) { + yypush(BCOMMENT); + onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar); + } + onNonSymbolMatched(yytext(), yychar); + } + \" { + chkLOC(); + yypush(STRING); + if (nestedComment == 0) { + onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); + } + onNonSymbolMatched(yytext(), yychar); + } + {QuotedStringBegin} { + chkLOC(); + yypush(QSTRING); + if (nestedComment == 0) { + onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); + } + onNonSymbolMatched(yytext(), yychar); + + String key = yytext(); + quotedStringKey = key.substring(1, key.length() - 1); + } + {QuotedExtensionBegin} { + chkLOC(); + if (nestedComment == 0) { + onDisjointSpanChanged(HtmlConsts.MACRO_CLASS, yychar); + } + onNonSymbolMatched(yytext(), yychar); + yypush(QEXTENSIONBEGIN); + } +} + + { + "*)" { + onNonSymbolMatched(yytext(), yychar); + if (--nestedComment == 0) { + yypop(); + } + } +} + +{WhspChar}*{EOL} { onEndOfLineMatched(yytext(), yychar); } +[[\s]--[\n]] { onNonSymbolMatched(yytext(), yychar); } +[^\n] { chkLOC(); onNonSymbolMatched(yytext(), yychar); } + + { + {FPath} { + chkLOC(); + onPathlikeMatched(yytext(), '/', false, yychar); + } + {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+ { + chkLOC(); + onEmailAddressMatched(yytext(), yychar); + } +} + + { + {BrowseableURI} { + chkLOC(); + onUriMatched(yytext(), yychar); + } +} + + { + {BrowseableURI} \}? 
{ + onUriMatched(yytext(), yychar); + } +} diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlSymbolTokenizerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlSymbolTokenizerTest.java new file mode 100644 index 00000000000..dc52802b6d9 --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlSymbolTokenizerTest.java @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Yelisey Romanov . + */ +package org.opengrok.indexer.analysis.ocaml; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.junit.jupiter.api.Test; +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.JFlexTokenizer; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.LinkedList; +import java.util.List; + +/** + * Tests the {@link OCamlSymbolTokenizer} class. 
+ */ +class OCamlSymbolTokenizerTest { + + private final AbstractAnalyzer analyzer; + + OCamlSymbolTokenizerTest() { + this.analyzer = new OCamlAnalyzerFactory().getAnalyzer(); + } + + private String[] getTermsFor(Reader r) { + List l = new LinkedList<>(); + JFlexTokenizer ts = (JFlexTokenizer) this.analyzer.tokenStream("refs", r); + ts.setReader(r); + CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); + try { + ts.reset(); + while (ts.incrementToken()) { + l.add(term.toString()); + } + } catch (IOException ex) { + throw new RuntimeException(ex); + } + + return l.toArray(new String[0]); + } + + @Test + void sampleTest() throws IOException { + try (InputStream res = getClass().getClassLoader().getResourceAsStream("analysis/ocaml/sample.ml"); + InputStreamReader r = new InputStreamReader(res, StandardCharsets.UTF_8)) { + String[] termsFor = getTermsFor(r); + assertArrayEquals( + new String[] { + "print_string", "again", "print_string", "again", + "'a", "tau", "Tau", "'a", "Phi", "'a", "Omicron", + "weLovePolymorphicVariants", "`Right", "`OrNot", "`OrUnsure", + "weLoveVariablesWithQuotes'", "None", + "failwith", + "Some", "reason", "_is_needed_for", "result", + "reason", "failwith", "result", + "_sum_some_numbers", "Int64", "to_int", + "Nativeint", "to_int", + "Int32", "to_int", + "Int32", "to_int", + "_float_around" + }, + termsFor); + } + } +} diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlXrefTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlXrefTest.java new file mode 100644 index 00000000000..1fca21da2f1 --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/ocaml/OCamlXrefTest.java @@ -0,0 +1,179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2017, 2018, Chris Fraire . + * Portions Copyright (c) 2025, Yelisey Romanov . + */ +package org.opengrok.indexer.analysis.ocaml; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.PrintStream; +import java.io.StringReader; +import java.io.StringWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; + +import org.junit.jupiter.api.Test; +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.CtagsReader; +import org.opengrok.indexer.analysis.Definitions; +import org.opengrok.indexer.analysis.WriteXrefArgs; +import org.opengrok.indexer.analysis.Xrefer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.opengrok.indexer.util.CustomAssertions.assertLinesEqual; +import static org.opengrok.indexer.util.StreamUtils.copyStream; + +/** + * Tests the {@link OCamlXref} class. 
+ */ +class OCamlXrefTest { + + /** + * Writes the xref for a one-line OCaml snippet held in a string and + * checks both the generated lines and the reported LOC of 1. + */ + @Test + void basicTest() throws IOException { + String s = "print_string \"Hello, world!\""; + Writer w = new StringWriter(); + OCamlAnalyzerFactory fac = new OCamlAnalyzerFactory(); + AbstractAnalyzer analyzer = fac.getAnalyzer(); + WriteXrefArgs xargs = new WriteXrefArgs(new StringReader(s), w); + Xrefer xref = analyzer.writeXref(xargs); + assertLinesEqual("OCaml basicTest", + "1" + + "print_string" + + " "Hello, world!"\n", + w.toString()); + assertEquals(1, xref.getLOC(), "OCaml LOC"); + } + + /** + * Prints a page to {@code os}: two leading println calls (the first + * carries the text OCaml Xref Test), then the xref that the analyzer + * from {@link OCamlAnalyzerFactory} writes for {@code is} read as UTF-8, + * then one closing println. + * + * @param is stream of OCaml source to cross-reference + * @param os destination of the generated page + * @param defs definitions set on the {@link WriteXrefArgs} before writing + * @return the LOC reported by the resulting {@link Xrefer} + * @throws IOException declared for failures while writing the xref + */ + private static int writeOCamlXref(InputStream is, PrintStream os, + Definitions defs) throws IOException { + os.println("" + + "OCaml Xref Test"); + os.println("
");
+        Writer w = new StringWriter();
+        OCamlAnalyzerFactory fac = new OCamlAnalyzerFactory();
+        AbstractAnalyzer analyzer = fac.getAnalyzer();
+        WriteXrefArgs args = new WriteXrefArgs(new InputStreamReader(is, StandardCharsets.UTF_8), w);
+        args.setDefs(defs);
+        Xrefer xref = analyzer.writeXref(args);
+        os.print(w.toString());
+        os.println("
"); + return xref.getLOC(); + } + + /** + * Generates the xref for analysis/ocaml/sample.ml with one hand-built + * definition tag, compares it line by line with + * analysis/ocaml/sample_xref.html and expects a LOC of 17. + * NOTE(review): neither resource stream is null-checked here, unlike in + * writeAndCompare, so a missing resource would presumably fail with a + * NullPointerException instead of an assertion message. + */ + @Test + void sampleTest() throws IOException { + // load sample source + InputStream sampleInputStream = getClass().getClassLoader().getResourceAsStream( + "analysis/ocaml/sample.ml"); + ByteArrayOutputStream sampleOutputStream = new ByteArrayOutputStream(); + + Definitions defs = new Definitions(); + defs.addTag(6, "x'y'", "functions", + "let x'y' = let f' = 1; g'h = 2 in f' + g'h", 0, 0); + + int actLOC; + try { + actLOC = writeOCamlXref(sampleInputStream, new PrintStream(sampleOutputStream), defs); + } finally { + sampleInputStream.close(); + sampleOutputStream.close(); + } + + // load expected xref + InputStream expectedInputStream = getClass().getClassLoader().getResourceAsStream( + "analysis/ocaml/sample_xref.html"); + ByteArrayOutputStream expectedOutputSteam = new ByteArrayOutputStream(); + try { + byte[] buffer = new byte[8192]; + int numBytesRead; + do { + numBytesRead = expectedInputStream.read(buffer, 0, buffer.length); + if (numBytesRead > 0) { + expectedOutputSteam.write(buffer, 0, numBytesRead); + } + } while (numBytesRead >= 0); + } finally { + expectedInputStream.close(); + expectedOutputSteam.close(); + } + + String[] actual = new String(sampleOutputStream.toByteArray(), StandardCharsets.UTF_8).split("\\r?\\n"); + String[] expected = new String(expectedOutputSteam.toByteArray(), StandardCharsets.UTF_8).split("\\r?\\n"); + assertLinesEqual("OCaml sampleTest()", expected, actual); + assertEquals(17, actLOC, "OCaml LOC"); + } + + /** + * Runs {@code writeAndCompare} for analysis/ocaml/sample2.ml against + * analysis/ocaml/sample2_xref.html with the definitions returned by + * {@code getTagsDefinitions} and an expected LOC of 11. + */ + @Test + void sampleTest2() throws IOException { + writeAndCompare("analysis/ocaml/sample2.ml", + "analysis/ocaml/sample2_xref.html", + getTagsDefinitions(), 11); + } + + /** + * Writes the xref for one classpath resource and compares it line by line + * with another, then checks the reported LOC. + * NOTE(review): the generated text is split on an optional carriage + * return plus line feed while the expected text is split on line feed + * only; confirm that expected resources never contain CRLF. + * NOTE(review): res and exp are closed outside any finally block, so + * they stay open if an earlier statement throws. + * + * @param sourceResource classpath name of the OCaml source + * @param resultResource classpath name of the expected xref page + * @param defs definitions handed to {@code writeOCamlXref} + * @param expLOC LOC that the xref writer is expected to report + * @throws IOException declared for stream or xref failures + */ + private void writeAndCompare(String sourceResource, String resultResource, + Definitions defs, int expLOC) throws IOException { + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + InputStream res = getClass().getClassLoader().getResourceAsStream( sourceResource); + assertNotNull(res, sourceResource + " should 
get-as-stream"); + int actLOC = writeOCamlXref(res, new PrintStream(baos), defs); + res.close(); + + InputStream exp = getClass().getClassLoader().getResourceAsStream( resultResource); + assertNotNull(exp, resultResource + " should get-as-stream"); + byte[] expbytes = copyStream(exp); + exp.close(); + baos.close(); + + String ostr = new String(baos.toByteArray(), StandardCharsets.UTF_8); + String[] gotten = ostr.split("\\r?\\n"); + + String estr = new String(expbytes, StandardCharsets.UTF_8); + String[] expected = estr.split("\n"); + + assertLinesEqual("OCaml xref", expected, gotten); + assertEquals(expLOC, actLOC, "OCaml LOC"); + } + + /** + * Feeds every line of analysis/ocaml/sampletags, read as UTF-8, to a + * {@link CtagsReader} and returns the definitions it collected. + * NOTE(review): the BufferedReader is never closed. + * + * @return the definitions obtained from the reader + * @throws IOException if reading a line fails + */ + private Definitions getTagsDefinitions() throws IOException { + InputStream res = getClass().getClassLoader().getResourceAsStream( + "analysis/ocaml/sampletags"); + assertNotNull(res, "though sampletags should stream,"); + + BufferedReader in = new BufferedReader(new InputStreamReader(res, StandardCharsets.UTF_8)); + + CtagsReader rdr = new CtagsReader(); + String line; + while ((line = in.readLine()) != null) { + rdr.readLine(line); + } + return rdr.getDefinitions(); + } +} diff --git a/opengrok-indexer/src/test/resources/analysis/ocaml/sample.ml b/opengrok-indexer/src/test/resources/analysis/ocaml/sample.ml new file mode 100644 index 00000000000..29a00a1e34e --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/ocaml/sample.ml @@ -0,0 +1,30 @@ +(* The sample file. *) +print_string "Hello World!\n";; + +let again = print_string and + string = {bar|Another string|bar} in + (again [@tailcall]) string;; + (* Note, identifier 'string' is deliberately ignored + by tokenizer. *) + +type 'a tau = Tau of 'a | Phi of 'a list | Omicron;; +(* Btw, do you know that + 'a is read as α + 'b is read as ß + 'c is γ - γάμμα ! *) + +let weLovePolymorphicVariants = [`Right ; `OrNot ; `OrUnsure];; + +let weLoveVariablesWithQuotes' = function None -> failwith "???" 
+ | Some reason -> + let _is_needed_for = 8n and + result = reason in + failwith result;; +(* Note: 'result' is not ignored, like 'string' *) +let _sum_some_numbers = Int64.to_int 10_8_8L + + Nativeint.to_int 0xDEADF00Dn + Int32.to_int 0o76l + 0b101 + + Int32.to_int 0b111001l in +();; + +let _float_around = 1.8E+23 +. 1_2_3_4.8_8E-2 in +();; diff --git a/opengrok-indexer/src/test/resources/analysis/ocaml/sample2.ml b/opengrok-indexer/src/test/resources/analysis/ocaml/sample2.ml new file mode 100644 index 00000000000..581af9fdc53 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/ocaml/sample2.ml @@ -0,0 +1,20 @@ +(* Test comments and extension nodes *) + +(* "*)" *) + +let _c = 'c' and + _d = '\78' and + _e = '\o003' and + _f = '\xAf' + +(* {|*)|} *) + +let str = {| (* *) |} + +(* '"' *) + +let _ = [%string {| (* *) |}] + +(* f' '"' *) + +let f = {%string | (* *) |}] diff --git a/opengrok-indexer/src/test/resources/analysis/ocaml/sample2_xref.html b/opengrok-indexer/src/test/resources/analysis/ocaml/sample2_xref.html new file mode 100644 index 00000000000..2a1dc92c989 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/ocaml/sample2_xref.html @@ -0,0 +1,24 @@ +OCaml Xref Test +
+1(* Test comments and extension nodes *)
+2
+3(* "*)" *)
+4
+5let _c = 'c' and
+6    _d = '\78' and
+7    _e = '\o003' and
+8    _f = '\xAf'
+9
+10(* {|*)|} *)
+11
+12let str = {| (* *) |}
+13
+14(* '"' *)
+15
+16let _ = [%string {| (* *) |}]
+17
+18(* f' '"' *)
+19
+20let f = {%string | (* *) |}]
+21
diff --git a/opengrok-indexer/src/test/resources/analysis/ocaml/sample_xref.html b/opengrok-indexer/src/test/resources/analysis/ocaml/sample_xref.html new file mode 100644 index 00000000000..0d8d4971be4 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/ocaml/sample_xref.html @@ -0,0 +1,34 @@ +OCaml Xref Test +
+1(* The sample file. *)
+2print_string "Hello World!\n";;
+3
+4let again = print_string and
+5    string = {bar|Another string|bar} in
+6    (again [@tailcall]) string;;
+7    (* Note, identifier 'string' is deliberately ignored
+8                                                by tokenizer. *)
+9
+10type 'a tau = Tau of 'a | Phi of 'a list | Omicron;;
+11(* Btw, do you know that
+12   'a is read as α
+13   'b is read as ß
+14   'c is γ - γάμμα ! *)
+15
+16let weLovePolymorphicVariants = [`Right ; `OrNot ; `OrUnsure];;
+17
+18let weLoveVariablesWithQuotes' = function None -> failwith "???"
+19                                        | Some reason ->
+20                                                let _is_needed_for = 8n and
+21                                                    result = reason in
+22                                                failwith result;;
+23(* Note: 'result' is not ignored, like 'string' *)
+24let _sum_some_numbers = Int64.to_int 10_8_8L +
+25    Nativeint.to_int 0xDEADF00Dn + Int32.to_int 0o76l + 0b101 +
+26                     Int32.to_int 0b111001l in
+27();;
+28
+29let _float_around = 1.8E+23 +. 1_2_3_4.8_8E-2 in
+30();;
+31
diff --git a/opengrok-indexer/src/test/resources/analysis/ocaml/sampletags b/opengrok-indexer/src/test/resources/analysis/ocaml/sampletags new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/ocaml/sampletags @@ -0,0 +1 @@ +