perf: optimize parsing in Connection API (#3800)

toddlipcon · olavloite · web-flow · commit a2780edb3d9d · 2025-04-11T10:16:25.000+02:00
* Add a simple benchmark for statement parser

Also fixes the pom to run the annotation processor so that
mvn -Pbenchmark actually is able to run benchmarks.

* Optimize AbstractStatementParser.statementStartsWith

I found this was taking ~25% of the CPU of pgadapter when running
the TPCC benchmark loader, which seems to generate very large DMLs.

Previously, it would call split() over the whole string with
a limit. Now, it uses Guava's lazy splitter so that it doesn't
have to copy the remainder of the string following the second
match.

For whatever reason, it seems like the previous implementation
was doing something much more expensive than just copying the
tail. For 100kb long query text, this new implementation is
1600x faster. For short queries it's only a few times faster.

Before:
Benchmark                                Mode  Cnt        Score        Error  Units
StatementParserBenchmark.isQueryTest    thrpt    5  1461962.835 ± 340237.573  ops/s
StatementParserBenchmark.longQueryTest  thrpt    5     2873.150 ±    490.611  ops/s

After:

Benchmark                                Mode  Cnt        Score        Error  Units
StatementParserBenchmark.isQueryTest    thrpt    5  4765215.378 ± 132661.232  ops/s
StatementParserBenchmark.longQueryTest  thrpt    5  4671884.683 ± 486566.506  ops/s

* perf: further micro optimizations to parser

* fix: remove supportsExplain

The `supportsExplain()` method did not actually do anything useful and
returned the wrong result. The reason that it was not useful is that:
1. Parsers that do support the EXPLAIN keyword handle these as client-side statements.
   This means that they never go into the isQuery() method.
2. Parsers that do not support the EXPLAIN keyword cannot do anything with it anyways.

* build: register clirr difference

---------

Co-authored-by: Knut Olav Løite &lt;koloite@gmail.com&gt;
diff --git a/google-cloud-spanner/clirr-ignored-differences.xml b/google-cloud-spanner/clirr-ignored-differences.xml
@@ -927,5 +927,23 @@
     <className>com/google/cloud/spanner/connection/ConnectionOptions</className>
     <field>VALID_PROPERTIES</field>
   </difference>
+
+  <!-- Remove supportsExplain() from the parser -->
+  <difference>
+    <differenceType>7002</differenceType>
+    <className>com/google/cloud/spanner/connection/AbstractStatementParser</className>
+    <method>boolean supportsExplain()</method>
+  </difference>
+  <difference>
+    <differenceType>7002</differenceType>
+    <className>com/google/cloud/spanner/connection/PostgreSQLStatementParser</className>
+    <method>boolean supportsExplain()</method>
+  </difference>
+  <difference>
+    <differenceType>7002</differenceType>
+    <className>com/google/cloud/spanner/connection/SpannerStatementParser</className>
+    <method>boolean supportsExplain()</method>
+  </difference>
+
   
 </differences>
diff --git a/google-cloud-spanner/pom.xml b/google-cloud-spanner/pom.xml
@@ -517,6 +517,19 @@
               </execution>
             </executions>
           </plugin>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-compiler-plugin</artifactId>
+            <configuration>
+              <annotationProcessorPaths>
+                <path>
+                  <groupId>org.openjdk.jmh</groupId>
+                  <artifactId>jmh-generator-annprocess</artifactId>
+                  <version>1.37</version>
+                </path>
+              </annotationProcessorPaths>
+            </configuration>
+          </plugin>
         </plugins>
       </build>
     </profile>
diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/Statement.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/Statement.java
@@ -21,6 +21,7 @@
 
 import com.google.cloud.spanner.ReadContext.QueryAnalyzeMode;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableMap;
 import com.google.spanner.v1.ExecuteSqlRequest.QueryOptions;
 import java.io.Serializable;
 import java.util.Collections;
@@ -140,7 +141,7 @@ Builder handle(Value value) {
 
   /** Creates a {@code Statement} with the given SQL text {@code sql}. */
   public static Statement of(String sql) {
-    return newBuilder(sql).build();
+    return new Statement(sql, ImmutableMap.of(), /*queryOptions=*/ null);
   }
 
   /** Creates a new statement builder with the SQL text {@code sql}. */
diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/AbstractStatementParser.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/AbstractStatementParser.java
@@ -31,6 +31,7 @@
 import com.google.cloud.spanner.connection.UnitOfWork.CallType;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Splitter;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.cache.CacheStats;
@@ -41,6 +42,7 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
@@ -511,7 +513,7 @@ ParsedStatement parse(Statement statement, QueryOptions defaultQueryOptions) {
     return parsedStatement.copy(statement, defaultQueryOptions);
   }
 
-  private ParsedStatement internalParse(Statement statement, QueryOptions defaultQueryOptions) {
+  ParsedStatement internalParse(Statement statement, QueryOptions defaultQueryOptions) {
     StatementHintParser statementHintParser =
         new StatementHintParser(getDialect(), statement.getSql());
     ReadQueryUpdateTransactionOption[] optionsFromHints = EMPTY_OPTIONS;
@@ -521,16 +523,21 @@ private ParsedStatement internalParse(Statement statement, QueryOptions defaultQ
           statement.toBuilder().replace(statementHintParser.getSqlWithoutClientSideHints()).build();
       optionsFromHints = convertHintsToOptions(statementHintParser.getClientSideStatementHints());
     }
+    // TODO: Qualify statements without removing comments first.
     String sql = removeCommentsAndTrim(statement.getSql());
     ClientSideStatementImpl client = parseClientSideStatement(sql);
     if (client != null) {
       return ParsedStatement.clientSideStatement(client, statement, sql);
-    } else if (isQuery(sql)) {
-      return ParsedStatement.query(statement, sql, defaultQueryOptions, optionsFromHints);
-    } else if (isUpdateStatement(sql)) {
-      return ParsedStatement.update(statement, sql, checkReturningClause(sql), optionsFromHints);
-    } else if (isDdlStatement(sql)) {
-      return ParsedStatement.ddl(statement, sql);
+    } else {
+      String sqlWithoutHints =
+          !sql.isEmpty() && sql.charAt(0) == '@' ? removeStatementHint(sql) : sql;
+      if (isQuery(sqlWithoutHints)) {
+        return ParsedStatement.query(statement, sql, defaultQueryOptions, optionsFromHints);
+      } else if (isUpdateStatement(sqlWithoutHints)) {
+        return ParsedStatement.update(statement, sql, checkReturningClause(sql), optionsFromHints);
+      } else if (isDdlStatement(sqlWithoutHints)) {
+        return ParsedStatement.ddl(statement, sql);
+      }
     }
     return ParsedStatement.unknown(statement, sql);
   }
@@ -610,20 +617,16 @@ public boolean isUpdateStatement(String sql) {
     return statementStartsWith(sql, dmlStatements);
   }
 
-  protected abstract boolean supportsExplain();
-
   private boolean statementStartsWith(String sql, Iterable<String> checkStatements) {
     Preconditions.checkNotNull(sql);
-    String[] tokens = sql.split("\\s+", 2);
-    int checkIndex = 0;
-    if (supportsExplain() && tokens[0].equalsIgnoreCase("EXPLAIN")) {
-      checkIndex = 1;
-    }
-    if (tokens.length > checkIndex) {
-      for (String check : checkStatements) {
-        if (tokens[checkIndex].equalsIgnoreCase(check)) {
-          return true;
-        }
+    Iterator<String> tokens = Splitter.onPattern("\\s+").split(sql).iterator();
+    if (!tokens.hasNext()) {
+      return false;
+    }
+    String token = tokens.next();
+    for (String check : checkStatements) {
+      if (token.equalsIgnoreCase(check)) {
+        return true;
       }
     }
     return false;
@@ -929,7 +932,8 @@ int skipQuoted(
       appendIfNotNull(result, startQuote);
       appendIfNotNull(result, startQuote);
     }
-    while (currentIndex < sql.length()) {
+    int length = sql.length();
+    while (currentIndex < length) {
       char currentChar = sql.charAt(currentIndex);
       if (currentChar == startQuote) {
         if (supportsDollarQuotedStrings() && currentChar == DOLLAR) {
@@ -940,7 +944,7 @@ int skipQuoted(
             return currentIndex + tag.length() + 2;
           }
         } else if (supportsEscapeQuoteWithQuote()
-            && sql.length() > currentIndex + 1
+            && length > currentIndex + 1
             && sql.charAt(currentIndex + 1) == startQuote) {
           // This is an escaped quote (e.g. 'foo''bar')
           appendIfNotNull(result, currentChar);
@@ -949,7 +953,7 @@ int skipQuoted(
           continue;
         } else if (isTripleQuoted) {
           // Check if this is the end of the triple-quoted string.
-          if (sql.length() > currentIndex + 2
+          if (length > currentIndex + 2
               && sql.charAt(currentIndex + 1) == startQuote
               && sql.charAt(currentIndex + 2) == startQuote) {
             appendIfNotNull(result, currentChar);
@@ -963,7 +967,7 @@ int skipQuoted(
         }
       } else if (supportsBackslashEscape()
           && currentChar == BACKSLASH
-          && sql.length() > currentIndex + 1
+          && length > currentIndex + 1
           && sql.charAt(currentIndex + 1) == startQuote) {
         // This is an escaped quote (e.g. 'foo\'bar').
         // Note that in raw strings, the \ officially does not start an escape sequence, but the
diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/PostgreSQLStatementParser.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/PostgreSQLStatementParser.java
@@ -46,15 +46,6 @@ Dialect getDialect() {
     return Dialect.POSTGRESQL;
   }
 
-  /**
-   * Indicates whether the parser supports the {@code EXPLAIN} clause. The PostgreSQL parser does
-   * not support it.
-   */
-  @Override
-  protected boolean supportsExplain() {
-    return false;
-  }
-
   @Override
   boolean supportsNestedComments() {
     return true;
@@ -125,7 +116,8 @@ String removeCommentsAndTrimInternal(String sql) {
     int multiLineCommentStartIdx = -1;
     StringBuilder res = new StringBuilder(sql.length());
     int index = 0;
-    while (index < sql.length()) {
+    int length = sql.length();
+    while (index < length) {
       char c = sql.charAt(index);
       if (isInSingleLineComment) {
         if (c == '\n') {
@@ -134,34 +126,34 @@ String removeCommentsAndTrimInternal(String sql) {
           res.append(c);
         }
       } else if (multiLineCommentLevel > 0) {
-        if (sql.length() > index + 1 && c == ASTERISK && sql.charAt(index + 1) == SLASH) {
+        if (length > index + 1 && c == ASTERISK && sql.charAt(index + 1) == SLASH) {
           multiLineCommentLevel--;
           if (multiLineCommentLevel == 0) {
-            if (!whitespaceBeforeOrAfterMultiLineComment && (sql.length() > index + 2)) {
+            if (!whitespaceBeforeOrAfterMultiLineComment && (length > index + 2)) {
               whitespaceBeforeOrAfterMultiLineComment =
                   Character.isWhitespace(sql.charAt(index + 2));
             }
             // If the multiline comment does not have any whitespace before or after it, and it is
             // neither at the start nor at the end of SQL string, append an extra space.
             if (!whitespaceBeforeOrAfterMultiLineComment
                 && (multiLineCommentStartIdx != 0)
-                && (index != sql.length() - 2)) {
+                && (index != length - 2)) {
               res.append(' ');
             }
           }
           index++;
-        } else if (sql.length() > index + 1 && c == SLASH && sql.charAt(index + 1) == ASTERISK) {
+        } else if (length > index + 1 && c == SLASH && sql.charAt(index + 1) == ASTERISK) {
           multiLineCommentLevel++;
           index++;
         }
       } else {
         // Check for -- which indicates the start of a single-line comment.
-        if (sql.length() > index + 1 && c == HYPHEN && sql.charAt(index + 1) == HYPHEN) {
+        if (length > index + 1 && c == HYPHEN && sql.charAt(index + 1) == HYPHEN) {
           // This is a single line comment.
           isInSingleLineComment = true;
           index += 2;
           continue;
-        } else if (sql.length() > index + 1 && c == SLASH && sql.charAt(index + 1) == ASTERISK) {
+        } else if (length > index + 1 && c == SLASH && sql.charAt(index + 1) == ASTERISK) {
           multiLineCommentLevel++;
           if (index >= 1) {
             whitespaceBeforeOrAfterMultiLineComment = Character.isWhitespace(sql.charAt(index - 1));
diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/SpannerStatementParser.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/connection/SpannerStatementParser.java
@@ -46,15 +46,6 @@ Dialect getDialect() {
     return Dialect.GOOGLE_STANDARD_SQL;
   }
 
-  /**
-   * Indicates whether the parser supports the {@code EXPLAIN} clause. The Spanner parser does
-   * support it.
-   */
-  @Override
-  protected boolean supportsExplain() {
-    return true;
-  }
-
   @Override
   boolean supportsNestedComments() {
     return false;
diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/connection/StatementParserBenchmark.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/connection/StatementParserBenchmark.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.spanner.connection;
+
+import com.google.cloud.spanner.Dialect;
+import com.google.cloud.spanner.Statement;
+import com.google.cloud.spanner.connection.AbstractStatementParser.ParsedStatement;
+import com.google.spanner.v1.ExecuteSqlRequest.QueryOptions;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Warmup;
+
+@Fork(value = 1, warmups = 0)
+@Warmup(iterations = 1, time = 5)
+@Measurement(iterations = 5, time = 5)
+public class StatementParserBenchmark {
+  private static final Dialect dialect = Dialect.POSTGRESQL;
+  private static final AbstractStatementParser PARSER =
+      AbstractStatementParser.getInstance(dialect);
+
+  private static final String LONG_QUERY_TEXT =
+      generateLongStatement("SELECT * FROM foo WHERE 1", 100 * 1024); // 100kb
+
+  private static final String LONG_DML_TEXT =
+      generateLongStatement("update foo set bar=1 WHERE 1", 100 * 1024); // 100kb
+
+  /** Generates a long SQL-looking string. */
+  private static String generateLongStatement(String prefix, int length) {
+    StringBuilder sb = new StringBuilder(length + 50);
+    sb.append(prefix);
+    while (sb.length() < length) {
+      sb.append(" OR abcdefghijklmnopqrstuvwxyz='abcdefghijklmnopqrstuvwxyz'");
+    }
+    return sb.toString();
+  }
+
+  @Benchmark
+  public ParsedStatement isQueryTest() {
+    return PARSER.internalParse(
+        Statement.of("CREATE TABLE FOO (ID INT64, NAME STRING(100)) PRIMARY KEY (ID)"),
+        QueryOptions.getDefaultInstance());
+  }
+
+  @Benchmark
+  public ParsedStatement longQueryTest() {
+    return PARSER.internalParse(Statement.of(LONG_QUERY_TEXT), QueryOptions.getDefaultInstance());
+  }
+
+  @Benchmark
+  public ParsedStatement longDmlTest() {
+    return PARSER.internalParse(Statement.of(LONG_DML_TEXT), QueryOptions.getDefaultInstance());
+  }
+
+  public static void main(String[] args) throws Exception {
+    for (int i = 0; i < 100000; i++) {
+      if (PARSER.internalParse(Statement.of(LONG_QUERY_TEXT), QueryOptions.getDefaultInstance())
+          == null) {
+        throw new AssertionError();
+      }
+      if (PARSER.internalParse(Statement.of(LONG_DML_TEXT), QueryOptions.getDefaultInstance())
+          == null) {
+        throw new AssertionError();
+      }
+    }
+  }
+}