wip: entities parsing in browser

vuejs · yyx990803 · Nov 25, 2023 · Nov 12, 2023 · Nov 12, 2023 · Nov 12, 2023
commit 1021e335b89b7c157638e5983b8fc0d636b25e38
diff --git a/packages/compiler-core/src/options.ts b/packages/compiler-core/src/options.ts
@@ -50,7 +50,8 @@ export interface ParserOptions
    */
   whitespace?: 'preserve' | 'condense'
   /**
-   * Only needed for DOM compilers
+   * Only used for DOM compilers that run in the browser.
+   * In non-browser builds, this option is ignored.
    */
   decodeEntities?: (rawText: string, asAttr: boolean) => string
   /**

diff --git a/packages/compiler-core/src/parser/Tokenizer.ts b/packages/compiler-core/src/parser/Tokenizer.ts
@@ -22,12 +22,20 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 IN THE SOFTWARE.
  */
 
+import { ElementNode, Position } from '../ast'
+
+/**
+ * Note: entities is a non-browser-build-only dependency.
+ * In the browser, we use an HTML element to do the decoding.
+ * Make sure all imports from entities are only used in non-browser branches
+ * so that it can be properly treeshaken.
+ */
 import {
   EntityDecoder,
   DecodingMode,
-  htmlDecodeTree
+  htmlDecodeTree,
+  fromCodePoint
 } from 'entities/lib/decode.js'
-import { ElementNode, Position } from '../ast'
 
 export const enum ParseMode {
   BASE,
@@ -170,7 +178,7 @@ export enum QuoteType {
 
 export interface Callbacks {
   ontext(start: number, endIndex: number): void
-  ontextentity(codepoint: number, endIndex: number): void
+  ontextentity(char: string, endIndex: number): void
 
   oninterpolation(start: number, endIndex: number): void
 
@@ -180,7 +188,7 @@ export interface Callbacks {
   onclosetag(start: number, endIndex: number): void
 
   onattribdata(start: number, endIndex: number): void
-  onattribentity(codepoint: number): void
+  onattribentity(char: string): void
   onattribend(quote: QuoteType, endIndex: number): void
   onattribname(start: number, endIndex: number): void
   onattribnameend(endIndex: number): void
@@ -233,15 +241,17 @@ export default class Tokenizer {
   /** Reocrd newline positions for fast line / column calculation */
   private newlines: number[] = []
 
-  private readonly entityDecoder: EntityDecoder
+  private readonly entityDecoder?: EntityDecoder
 
   constructor(
     private readonly stack: ElementNode[],
     private readonly cbs: Callbacks
   ) {
-    this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
-      this.emitCodePoint(cp, consumed)
-    )
+    if (!__BROWSER__) {
+      this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
+        this.emitCodePoint(cp, consumed)
+      )
+    }
   }
 
   public mode = ParseMode.BASE
@@ -290,7 +300,7 @@ export default class Tokenizer {
       }
       this.state = State.BeforeTagName
       this.sectionStart = this.index
-    } else if (c === CharCodes.Amp) {
+    } else if (!__BROWSER__ && c === CharCodes.Amp) {
       this.startEntity()
     } else if (c === this.delimiterOpen[0]) {
       this.state = State.InterpolationOpen
@@ -398,7 +408,7 @@ export default class Tokenizer {
           !(this.mode === ParseMode.SFC && this.stack.length === 0))
       ) {
         // We have to parse entities in <title> and <textarea> tags.
-        if (c === CharCodes.Amp) {
+        if (!__BROWSER__ && c === CharCodes.Amp) {
           this.startEntity()
         }
       } else if (this.fastForwardTo(CharCodes.Lt)) {
@@ -702,15 +712,15 @@ export default class Tokenizer {
     }
   }
   private handleInAttributeValue(c: number, quote: number) {
-    if (c === quote) {
+    if (c === quote || (__BROWSER__ && this.fastForwardTo(quote))) {
       this.cbs.onattribdata(this.sectionStart, this.index)
       this.sectionStart = -1
       this.cbs.onattribend(
         quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single,
         this.index + 1
       )
       this.state = State.BeforeAttributeName
-    } else if (c === CharCodes.Amp) {
+    } else if (!__BROWSER__ && c === CharCodes.Amp) {
       this.startEntity()
     }
   }
@@ -727,7 +737,7 @@ export default class Tokenizer {
       this.cbs.onattribend(QuoteType.Unquoted, this.index)
       this.state = State.BeforeAttributeName
       this.stateBeforeAttributeName(c)
-    } else if (c === CharCodes.Amp) {
+    } else if (!__BROWSER__ && c === CharCodes.Amp) {
       this.startEntity()
     }
   }
@@ -796,29 +806,33 @@ export default class Tokenizer {
   }
 
   private startEntity() {
-    this.baseState = this.state
-    this.state = State.InEntity
-    this.entityStart = this.index
-    this.entityDecoder.startEntity(
-      this.baseState === State.Text || this.baseState === State.InSpecialTag
-        ? DecodingMode.Legacy
-        : DecodingMode.Attribute
-    )
+    if (!__BROWSER__) {
+      this.baseState = this.state
+      this.state = State.InEntity
+      this.entityStart = this.index
+      this.entityDecoder!.startEntity(
+        this.baseState === State.Text || this.baseState === State.InSpecialTag
+          ? DecodingMode.Legacy
+          : DecodingMode.Attribute
+      )
+    }
   }
 
   private stateInEntity(): void {
-    const length = this.entityDecoder.write(this.buffer, this.index)
+    if (!__BROWSER__) {
+      const length = this.entityDecoder!.write(this.buffer, this.index)
 
-    // If `length` is positive, we are done with the entity.
-    if (length >= 0) {
-      this.state = this.baseState
+      // If `length` is non-negative, we are done with the entity.
+      if (length >= 0) {
+        this.state = this.baseState
 
-      if (length === 0) {
-        this.index = this.entityStart
+        if (length === 0) {
+          this.index = this.entityStart
+        }
+      } else {
+        // Mark buffer as consumed.
+        this.index = this.buffer.length - 1
       }
-    } else {
-      // Mark buffer as consumed.
-      this.index = this.buffer.length - 1
     }
   }
 
@@ -1002,8 +1016,8 @@ export default class Tokenizer {
   }
 
   private finish() {
-    if (this.state === State.InEntity) {
-      this.entityDecoder.end()
+    if (!__BROWSER__ && this.state === State.InEntity) {
+      this.entityDecoder!.end()
       this.state = this.baseState
     }
 
@@ -1052,25 +1066,27 @@ export default class Tokenizer {
   }
 
   private emitCodePoint(cp: number, consumed: number): void {
-    if (
-      this.baseState !== State.Text &&
-      this.baseState !== State.InSpecialTag
-    ) {
-      if (this.sectionStart < this.entityStart) {
-        this.cbs.onattribdata(this.sectionStart, this.entityStart)
-      }
-      this.sectionStart = this.entityStart + consumed
-      this.index = this.sectionStart - 1
+    if (!__BROWSER__) {
+      if (
+        this.baseState !== State.Text &&
+        this.baseState !== State.InSpecialTag
+      ) {
+        if (this.sectionStart < this.entityStart) {
+          this.cbs.onattribdata(this.sectionStart, this.entityStart)
+        }
+        this.sectionStart = this.entityStart + consumed
+        this.index = this.sectionStart - 1
 
-      this.cbs.onattribentity(cp)
-    } else {
-      if (this.sectionStart < this.entityStart) {
-        this.cbs.ontext(this.sectionStart, this.entityStart)
-      }
-      this.sectionStart = this.entityStart + consumed
-      this.index = this.sectionStart - 1
+        this.cbs.onattribentity(fromCodePoint(cp))
+      } else {
+        if (this.sectionStart < this.entityStart) {
+          this.cbs.ontext(this.sectionStart, this.entityStart)
+        }
+        this.sectionStart = this.entityStart + consumed
+        this.index = this.sectionStart - 1
 
-      this.cbs.ontextentity(cp, this.sectionStart)
+        this.cbs.ontextentity(fromCodePoint(cp), this.sectionStart)
+      }
     }
   }
 }
diff --git a/packages/compiler-core/src/parser/index.ts b/packages/compiler-core/src/parser/index.ts
@@ -1,4 +1,3 @@
-import { fromCodePoint } from 'entities/lib/decode.js'
 import {
   AttributeNode,
   ConstantTypes,
@@ -29,6 +28,7 @@ import { defaultOnError, defaultOnWarn } from '../errors'
 import { forAliasRE, isCoreComponent } from '../utils'
 
 type OptionalOptions =
+  | 'decodeEntities'
   | 'whitespace'
   | 'isNativeTag'
   | 'isBuiltInComponent'
@@ -37,28 +37,13 @@ type OptionalOptions =
 type MergedParserOptions = Omit<Required<ParserOptions>, OptionalOptions> &
   Pick<ParserOptions, OptionalOptions>
 
-// The default decoder only provides escapes for characters reserved as part of
-// the template syntax, and is only used if the custom renderer did not provide
-// a platform-specific decoder.
-const decodeRE = /&(gt|lt|amp|apos|quot);/g
-const decodeMap: Record<string, string> = {
-  gt: '>',
-  lt: '<',
-  amp: '&',
-  apos: "'",
-  quot: '"'
-}
-
 export const defaultParserOptions: MergedParserOptions = {
   parseMode: 'base',
   delimiters: [`{{`, `}}`],
   getNamespace: () => Namespaces.HTML,
   isVoidTag: NO,
   isPreTag: NO,
   isCustomElement: NO,
-  // TODO handle entities
-  decodeEntities: (rawText: string): string =>
-    rawText.replace(decodeRE, (_, p1) => decodeMap[p1]),
   onError: defaultOnError,
   onWarn: defaultOnWarn,
   comments: __DEV__
@@ -84,8 +69,8 @@ const tokenizer = new Tokenizer(stack, {
     onText(getSlice(start, end), start, end)
   },
 
-  ontextentity(cp, end) {
-    onText(fromCodePoint(cp), end - 1, end)
+  ontextentity(char, end) {
+    onText(char, end - 1, end)
   },
 
   oninterpolation(start, end) {
@@ -242,8 +227,8 @@ const tokenizer = new Tokenizer(stack, {
     currentAttrEndIndex = end
   },
 
-  onattribentity(codepoint) {
-    currentAttrValue += fromCodePoint(codepoint)
+  onattribentity(char) {
+    currentAttrValue += char
   },
 
   onattribnameend(end) {
@@ -265,6 +250,13 @@ const tokenizer = new Tokenizer(stack, {
   onattribend(quote, end) {
     if (currentElement && currentProp) {
       if (quote !== QuoteType.NoValue) {
+        if (__BROWSER__ && currentAttrValue.includes('&')) {
+          // TODO should not do this in <script> or <style>
+          currentAttrValue = currentOptions.decodeEntities!(
+            currentAttrValue,
+            true
+          )
+        }
         if (currentProp.type === NodeTypes.ATTRIBUTE) {
           // assign value
 
@@ -422,6 +414,10 @@ function closeCurrentTag(end: number) {
 }
 
 function onText(content: string, start: number, end: number) {
+  if (__BROWSER__ && content.includes('&')) {
+    // TODO do not do this in <script> or <style>
+    content = currentOptions.decodeEntities!(content, false)
+  }
   const parent = getParent()
   const lastNode = parent.children[parent.children.length - 1]
   if (lastNode?.type === NodeTypes.TEXT) {
@@ -697,6 +693,19 @@ export function baseParse(input: string, options?: ParserOptions): RootNode {
   currentInput = input
   currentOptions = extend({}, defaultParserOptions, options)
 
+  if (__DEV__) {
+    if (!__BROWSER__ && currentOptions.decodeEntities) {
+      console.warn(
+        `[@vue/compiler-core] decodeEntities option is passed but will be ` +
+          `ignored in non-browser builds.`
+      )
+    } else if (__BROWSER__ && !currentOptions.decodeEntities) {
+      throw new Error(
+        `[@vue/compiler-core] decodeEntities option is required in browser builds.`
+      )
+    }
+  }
+
   tokenizer.mode =
     currentOptions.parseMode === 'html'
       ? ParseMode.HTML