33namespace Keepsuit \Liquid \Parse ;
44
55use Keepsuit \Liquid \Exceptions \SyntaxException ;
6+ use Keepsuit \Liquid \TagBlock ;
67use RuntimeException ;
78
89class Lexer
@@ -30,12 +31,17 @@ class Lexer
3031 protected array $ tokens ;
3132
3233 /**
33- * @var array<int, array<int, array {0:string,1:int}> >
34+ * @var array<int, array{0:string,1:int}>
3435 */
3536 protected array $ positions ;
3637
3738 protected int $ position ;
3839
40+ /**
41+ * @var string[]
42+ */
43+ protected array $ rawBodyTags ;
44+
3945 public function __construct (
4046 protected ParseContext $ parseContext ,
4147 ) {}
@@ -53,10 +59,17 @@ public function tokenize(string $source): TokenStream
5359 $ this ->state = LexerState::Data;
5460 $ this ->tokens = [];
5561
62+ $ this ->rawBodyTags = array_keys (array_filter ($ this ->parseContext ->environment ->tagRegistry ->all (), function ($ tag ) {
63+ if (! is_subclass_of ($ tag , TagBlock::class)) {
64+ return false ;
65+ }
66+
67+ return $ tag ::hasRawBody ();
68+ }));
69+
5670 $ this ->parseContext ->lineNumber = 1 ;
5771
58- preg_match_all (LexerOptions::tokenStartRegex (), $ this ->source , $ matches , PREG_OFFSET_CAPTURE );
59- $ this ->positions = $ matches ;
72+ $ this ->positions = $ this ->extractTokenStarts ($ this ->source );
6073 $ this ->position = -1 ;
6174
6275 while ($ this ->cursor < $ this ->end ) {
@@ -79,42 +92,36 @@ public function tokenize(string $source): TokenStream
7992 protected function lexData (): void
8093 {
8194 // if no matches are left we return the rest of the template as simple text token
82- if ($ this ->position == count ($ this ->positions [ 0 ] ) - 1 ) {
95+ if ($ this ->position == count ($ this ->positions ) - 1 ) {
8396 $ this ->pushToken (TokenType::TextData, substr ($ this ->source , $ this ->cursor ));
8497 $ this ->cursor = $ this ->end ;
8598
8699 return ;
87100 }
88101
89102 // Find the first token after the current cursor
90- $ position = $ this ->positions [0 ][ ++$ this ->position ];
103+ $ position = $ this ->positions [++$ this ->position ];
91104 while ($ position [1 ] < $ this ->cursor ) {
92- if ($ this ->position == count ($ this ->positions [ 0 ] ) - 1 ) {
105+ if ($ this ->position == count ($ this ->positions ) - 1 ) {
93106 return ;
94107 }
95- $ position = $ this ->positions [0 ][ ++$ this ->position ];
108+ $ position = $ this ->positions [++$ this ->position ];
96109 }
97110
98111 // push the template text before the token first
99112 $ text = $ textBeforeToken = substr ($ this ->source , $ this ->cursor , $ position [1 ] - $ this ->cursor );
100113
101114 // trim?
102- if ($ this ->positions [2 ][ $ this ->position ][0 ] === LexerOptions::WhitespaceTrim->value ) {
115+ if (( $ this ->positions [$ this ->position ][0 ][ 2 ] ?? null ) === LexerOptions::WhitespaceTrim->value ) {
103116 $ textBeforeToken = rtrim ($ textBeforeToken );
104117 }
105118
106119 $ this ->pushToken (TokenType::TextData, $ textBeforeToken );
107120 $ this ->moveCursor ($ text .$ position [0 ]);
108121
109- switch ($ this ->positions [1 ][ $ this ->position ][0 ]) {
122+ switch ($ this ->positions [$ this ->position ][0 ]) {
110123 case LexerOptions::TagBlockStart->value :
111- // {% raw %}
112- if (preg_match (LexerOptions::blockRawStartRegex (), $ this ->source , $ matches , offset: $ this ->cursor ) === 1 ) {
113- $ this ->moveCursor ($ matches [0 ]);
114- $ this ->lexRawData ();
115- break ;
116- }
117-
124+ case LexerOptions::TagBlockStart->value .LexerOptions::WhitespaceTrim->value :
118125 // {% comment %}
119126 if (preg_match (LexerOptions::blockCommentStartRegex (), $ this ->source , $ matches , offset: $ this ->cursor ) === 1 ) {
120127 $ this ->moveCursor ($ matches [0 ]);
@@ -127,6 +134,7 @@ protected function lexData(): void
127134 $ this ->currentVarBlockLine = $ this ->lineNumber ;
128135 break ;
129136 case LexerOptions::TagVariableStart->value :
137+ case LexerOptions::TagVariableStart->value .LexerOptions::WhitespaceTrim->value :
130138 $ this ->pushToken (TokenType::VariableStart);
131139 $ this ->pushState (LexerState::Variable);
132140 $ this ->currentVarBlockLine = $ this ->lineNumber ;
@@ -145,9 +153,8 @@ protected function lexVariable(): void
145153 $ this ->popState ();
146154
147155 // trim?
148- if (trim ($ matches [0 ])[0 ] === LexerOptions::WhitespaceTrim->value ) {
149- preg_match ('/\s+/A ' , $ this ->source , $ matches , offset: $ this ->cursor );
150- $ this ->moveCursor ($ matches [0 ] ?? '' );
156+ if ($ matches [1 ][0 ] === LexerOptions::WhitespaceTrim->value ) {
157+ $ this ->trimWhitespaces ();
151158 }
152159 } else {
153160 $ this ->lexExpression ();
@@ -159,18 +166,40 @@ protected function lexVariable(): void
159166 */
protected function lexBlock(): void
{
    // First Identifier token produced inside this block, if any; its data is
    // later compared against the registered raw-body tag names.
    $tagToken = null;
    $blockEnd = [];

    // Keep lexing expressions until the block-end pattern matches when
    // applied from the current cursor offset.
    while (true) {
        $ended = preg_match(LexerOptions::blockEndRegex(), $this->source, $blockEnd, offset: $this->cursor);
        if ($ended === 1) {
            break;
        }

        $this->lexExpression();

        $newest = $this->tokens[array_key_last($this->tokens)];

        // Only the first identifier is remembered; later ones are ignored.
        $tagToken ??= $newest->type === TokenType::Identifier ? $newest : null;
    }

    // Step over the block-end delimiter itself.
    $this->moveCursor($blockEnd[0]);

    // A whitespace-trim marker at the start of capture group 1 means the
    // whitespace following the block must be skipped as well.
    if ($blockEnd[1][0] === LexerOptions::WhitespaceTrim->value) {
        $this->trimWhitespaces();
    }

    // When nothing was emitted after the block start, drop the block start
    // instead of closing it; otherwise emit the matching block end.
    $newest = $this->tokens[array_key_last($this->tokens)];
    if ($newest->type !== TokenType::BlockStart) {
        $this->pushToken(TokenType::BlockEnd);
    } else {
        array_pop($this->tokens);
    }

    $this->popState();

    // Tags registered as raw-body tags have their body lexed as raw data
    // rather than as regular liquid blocks.
    $isRawBodyTag = $tagToken !== null && in_array($tagToken->data, $this->rawBodyTags, true);
    if ($isRawBodyTag) {
        $this->laxRawBodyTag($tagToken->data);
    }
}
176205
@@ -227,23 +256,27 @@ protected function ensureStreamNotEnded(): void
227256 }
228257 }
229258
/**
 * Emits the body of a raw-body tag as a single RawData token.
 *
 * The body is everything between the cursor and the start of the match of
 * LexerOptions::blockRawBodyTagDataRegex($tag). The cursor is moved past the
 * body only; the matched text itself is not consumed here.
 *
 * NOTE(review): capture groups 1 and 2 are presumably the opening and closing
 * delimiters of the end tag - confirm against the regex definition.
 *
 * @param  string  $tag  Name of the tag whose body is being lexed.
 *
 * @throws SyntaxException When the pattern does not match after the cursor.
 */
protected function laxRawBodyTag(string $tag): void
{
    $found = preg_match(
        LexerOptions::blockRawBodyTagDataRegex($tag),
        $this->source,
        $closing,
        flags: PREG_OFFSET_CAPTURE,
        offset: $this->cursor,
    );

    if ($found !== 1) {
        throw SyntaxException::tagBlockNeverClosed($tag);
    }

    $trimMarker = LexerOptions::WhitespaceTrim->value;

    // Everything from the cursor up to the offset of the full match.
    $bodyLength = $closing[0][1] - $this->cursor;
    $rawBody = substr($this->source, $this->cursor, $bodyLength);

    // The cursor advances over the untrimmed body.
    $this->moveCursor($rawBody);

    // Inner trim: a trim marker as the third character of group 1 strips
    // trailing whitespace from the emitted body.
    $trimInner = ($closing[1][0][2] ?? null) === $trimMarker;
    $this->pushToken(TokenType::RawData, $trimInner ? rtrim($rawBody) : $rawBody);

    // Outer trim: a trim marker as the first character of group 2 skips
    // whitespace found at the cursor.
    if ($closing[2][0][0] === $trimMarker) {
        $this->trimWhitespaces();
    }
}
248281
249282 protected function lexComment (): void
@@ -265,24 +298,7 @@ protected function lexInlineComment(): void
265298
266299 $ text = substr ($ this ->source , $ this ->cursor , $ matches [0 ][1 ] - $ this ->cursor );
267300
268- $ this ->moveCursor ($ text .$ matches [0 ][0 ]);
269-
270- if ($ matches [1 ][0 ] === "\n" ) {
271- return ;
272- }
273-
274- $ lastToken = $ this ->tokens [count ($ this ->tokens ) - 1 ] ?? null ;
275-
276- if ($ lastToken ?->type === TokenType::BlockStart) {
277- array_pop ($ this ->tokens );
278- } else {
279- $ this ->pushToken (TokenType::BlockEnd);
280- }
281-
282- if ($ matches [1 ][0 ] === LexerOptions::WhitespaceTrim->value ) {
283- preg_match ('/\s+/A ' , $ this ->source , $ matches2 , offset: $ this ->cursor );
284- $ this ->moveCursor ($ matches2 [0 ] ?? '' );
285- }
301+ $ this ->moveCursor ($ text );
286302 }
287303
288304 protected function pushToken (TokenType $ type , string $ value = '' ): void
@@ -322,4 +338,24 @@ protected function popState(): void
322338
323339 $ this ->state = $ state ;
324340 }
341+
/**
 * Passes the run of whitespace that starts exactly at the cursor (if any)
 * to moveCursor(); an empty string is passed when there is none.
 */
protected function trimWhitespaces(): void
{
    $found = [];
    $matched = preg_match('/\s+/A', $this->source, $found, offset: $this->cursor);

    $leading = $matched === 1 ? $found[0] : '';

    $this->moveCursor($leading);
}
347+
/**
 * Collects every block-start and variable-start delimiter in the source.
 *
 * Both patterns are run over the whole source with offset capture; the full
 * matches are combined into one list ordered by ascending offset.
 *
 * @param  string  $source  Template source to scan.
 * @return array<int,array{0:string,1:int}> Pairs of matched text and its offset.
 */
protected function extractTokenStarts(string $source): array
{
    preg_match_all(LexerOptions::blockStartRegex(), $source, $blockMatches, PREG_OFFSET_CAPTURE);
    preg_match_all(LexerOptions::variableStartRegex(), $source, $variableMatches, PREG_OFFSET_CAPTURE);

    // Index 0 of each result holds the full-pattern matches.
    $starts = [...$blockMatches[0], ...$variableMatches[0]];

    usort($starts, static function (array $left, array $right): int {
        return $left[1] <=> $right[1];
    });

    return $starts;
}
325361}
0 commit comments