2 * ============LICENSE_START=======================================================
3 * Copyright (C) 2024 Ericsson
4 * Modifications Copyright (C) 2024 OpenInfra Foundation Europe
5 * ================================================================================
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 * SPDX-License-Identifier: Apache-2.0
19 * ============LICENSE_END=========================================================
21 package org.oran.smo.yangtools.parser.model.parser;
23 import java.io.BufferedReader;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.nio.charset.Charset;
28 import java.util.ArrayList;
29 import java.util.List;
31 import org.oran.smo.yangtools.parser.ParserExecutionContext;
32 import org.oran.smo.yangtools.parser.findings.Finding;
33 import org.oran.smo.yangtools.parser.findings.ParserFindingType;
34 import org.oran.smo.yangtools.parser.model.YangModel;
35 import org.oran.smo.yangtools.parser.model.parser.Token.TokenType;
38 * The sole purpose of this class is to tokenize a YAM (supplied as an input stream). Tokens are generated
39 * for various constructs of special significance (for example, the {} characters). Comment sections are skipped.
40 * String concatenation (usage of the + character) is also performed.
42 * A new instance of this class must be created for each YAM parsed.
44 * @author Mark Hollmann
46 public class YamTokenizer {
// Constant for the empty string. NOTE(review): no read of it is visible in this excerpt - confirm usage.
48 private static final String EMPTY_STRING = "";
// Set to true once a yang-version statement with a recognized value has been seen, so the lookup is not repeated.
50 private boolean yangVersionStatementAlreadyHandled = false;
// String parse rules in effect; starts as YANG1 and is switched when the yang-version statement is recognized.
51 private StringParseRules stringParseRules = StringParseRules.YANG1;
// Line number stamped onto tokens and findings created while a line is being processed.
53 private int currentLine = 0;
// Running character total: every line read adds its length plus one for the swallowed new-line.
54 private int charCount = 0;
// Tokenizer state that is carried over from one input line to the next.
56 private boolean inBlockComment = false;
57 private boolean inDoubleQuoteString = false;
58 private boolean inSingleQuoteString = false;
// Accumulates the content of the quoted string currently being read; re-created whenever a quote opens.
59 private StringBuilder quotedString;
// Raw tokens in document order, before plus-concatenation and semicolon clean-up.
61 private final List<Token> tokens = new ArrayList<>(10000);
// Receives every finding raised while tokenizing.
63 private final ParserExecutionContext context;
// The model being tokenized; attached to every finding.
64 private final YangModel yangModel;
// Source of the document text; wrapped in a UTF-8 reader by tokenize().
65 private final InputStream is;
// Creates a tokenizer for a single document. Stores the execution context (used to report findings)
// and the model the findings are reported against.
// NOTE(review): the assignment of the is field is not visible in this excerpt; the field is final,
// so it is presumably assigned here as well - confirm.
67 public YamTokenizer(final ParserExecutionContext context, final YangModel yangModel, final InputStream is)
69 this.context = context;
70 this.yangModel = yangModel;
// Tokenizes the complete document read from the input stream and returns an iterator over the cleaned-up tokens.
// Phase 1 reads the stream line by line and lets processString() turn each line into raw tokens.
// Phase 2 reports P014 findings when the document ends inside a block comment or a quoted string, or with a plus token.
// Phase 3 copies the raw tokens into a result list: quoted strings joined by plus tokens are merged into one token,
// stray plus tokens are reported (P015) and repeated semicolons are reported (P055).
// Throws IOException when reading a line from the stream fails.
74 public TokenIterator tokenize() throws IOException {
// Decode the stream as UTF-8 and consume it line by line.
76 final BufferedReader br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
80 while ((str = br.readLine()) != null) {
83 charCount += str.length();
84 charCount++; // +1 for the new-line character that the readLine() method will swallow
// A blank line inside a double-quoted string is kept as a line break in the string value.
86 if (inDoubleQuoteString && str.trim().isEmpty()) {
87 quotedString.append('\n');
// processString() hands back the unconsumed remainder; keep feeding it in until the line is used up.
89 while (!str.isEmpty()) {
90 str = processString(str);
96 * Make sure there are no dangling quoted strings and block comments at the end of the document.
99 context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
100 "Document ends with an unclosed block comment. Be sure to close block comments with '*/'."));
103 if (inDoubleQuoteString || inSingleQuoteString) {
104 context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
105 "Document ends with non-terminated single- or double-quoted string."));
110 * Make sure that the last token is not a + token
112 if (!tokens.isEmpty() && tokens.get(tokens.size() - 1).type == TokenType.PLUS) {
113 context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
114 "Document ends with a '+' symbol."));
// Cleaned-up token list that is handed to the iterator; it can never be longer than the raw list.
118 final List<Token> result = new ArrayList<>(tokens.size());
121 * Now we clean up the tokens and concatenate string tokens with their corresponding plus tokens.
122 * That takes care of constructs such as "'Hello ' + 'World!'" in the model.
124 for (int i = 0; i < tokens.size(); ++i) {
126 final Token oneToken = tokens.get(i);
128 if (oneToken.type == TokenType.QUOTED_STRING) {
130 * Concatenate quote strings that have + symbols between them
132 final StringBuilder sb = new StringBuilder(10000);
133 sb.append(oneToken.value);
134 final int lineNumberStart = oneToken.lineNumber;
// Look ahead for plus tokens and fold what follows them into the buffer.
// NOTE(review): index i + 2 is read without a visible bounds check. This is only safe if a trailing
// plus token stops processing before this loop; whether the check above returns early is not
// visible in this excerpt - confirm.
136 while (i + 1 < tokens.size()) {
137 if (tokens.get(i + 1).type == TokenType.PLUS) {
138 if (tokens.get(i + 2).type == TokenType.QUOTED_STRING) {
139 // regular concatenation
140 sb.append(tokens.get(i + 2).value);
142 } else if (tokens.get(i + 2).type == TokenType.STRING) {
144 * We have the following: "Hello " + World. Technically disallowed by the spec, but we allow it.
146 context.addFinding(new Finding(yangModel, tokens.get(i + 2).lineNumber,
147 ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
148 "The '+' symbol is followed by an unquoted string (the string must be quoted for the '+' to work)."));
149 sb.append(tokens.get(i + 2).value);
151 } else if (tokens.get(i + 2).type == TokenType.PLUS) {
153 * We have a "+ +" in the document. Illegal syntax, but we allow it.
155 context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
156 ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
157 "The '+' symbol is repeated. Remove one of them."));
161 * Something else unexpected. Hard finding.
163 context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
164 ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
165 "The '+' symbol is not followed by quoted string."));
// The merged text is emitted through Token.newStringToken, carrying the line number of the first operand.
173 result.add(Token.newStringToken(lineNumberStart, sb.toString()));
175 } else if (oneToken.type == TokenType.PLUS) {
177 * A plus token cannot just exist by itself, it must always sit between quoted strings.
179 context.addFinding(new Finding(yangModel, oneToken.lineNumber,
180 ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
181 "Encountered standalone '+' symbol (are the strings on either side of it quoted?)."));
184 } else if (oneToken.type == TokenType.SEMI_COLON) {
185 result.add(oneToken);
187 * Consume any repeated semicolons. That's not a finding, but simply poor model editing.
189 while (i + 1 < tokens.size()) {
190 if (tokens.get(i + 1).type != TokenType.SEMI_COLON) {
193 context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
194 ParserFindingType.P055_SUPERFLUOUS_STATEMENT.toString(), "Multiple semicolons."));
199 * An unquoted string, or a left/right brace. Retain as-is.
201 result.add(oneToken);
205 return new TokenIterator(result);
// Accessor for the number of characters read so far.
// NOTE(review): the method body is not visible in this excerpt; presumably it returns charCount - confirm.
208 public int getCharCount() {
// Accessor for the number of lines read so far.
// NOTE(review): the method body is not visible in this excerpt; presumably it returns currentLine - confirm.
212 public int getLineCount() {
217 * Process the supplied string. Whatever part of the string could not be processed will be returned for iterative processing.
// Consumes the leading part of one line fragment according to the current tokenizer state and returns
// whatever is left, so that the caller can pass the remainder in again.
//
// Parameter str: the part of the current line that has not been processed yet.
// Returns: the unprocessed remainder of str.
// NOTE(review): several return statements are not visible in this excerpt; presumably an empty string
// is returned whenever the fragment has been consumed completely - confirm.
//
// Four states are handled, in this order:
// - Inside a block comment: searches for the closing star-slash. On a hit the comment state ends and the
//   text after the terminator is returned.
// - Inside a double-quoted string: works on the trimmed line, resolves backslash escapes, and adds a
//   quoted-string token when the closing quote is reached.
// - Inside a single-quoted string: copies characters verbatim (no trimming) up to the closing quote and
//   appends a line break when the string continues on the next line.
// - Outside of all of these: recognizes line comments and block-comment starts, adds brace, semicolon
//   and plus tokens, opens quoted strings, or reads one unquoted string token.
// Each time a string token has been added, handleYangVersionStatement() is invoked so that the
// yang-version statement can be detected.
220 private String processString(final String str) {
222 if (inBlockComment) {
224 * Must try to find character sequence star-slash that ends the block comment.
226 final int indexOf = str.indexOf("*/");
228 // not found, the whole string is part of the comment, so we are done processing this string here
232 // found, so end the comment and return everything after the comment for further processing.
233 inBlockComment = false;
234 return str.substring(indexOf + 2);
236 } else if (inDoubleQuoteString) {
238 * Must try to find the closing quote character. Must be careful with character escaping.
239 * We trim anyway (idiotic YANG rule about indentation).
// Leading and trailing whitespace of the line is dropped before the characters are scanned.
241 final String trimmed = str.trim();
243 for (int i = 0; i < trimmed.length(); ++i) {
244 final char charAt = trimmed.charAt(i);
247 // done with this double-quoted string. Create token and return leftovers, if any
248 tokens.add(Token.newQuotedStringToken(currentLine, quotedString.toString()));
249 handleYangVersionStatement();
251 inDoubleQuoteString = false;
252 return trimmed.substring(i + 1);
// A backslash introduces an escape sequence; the character after it decides what is appended.
254 if (charAt == '\\') {
257 if (i >= trimmed.length()) {
259 * The backslash character is the last character on the line. This is always wrong. We will be nice,
260 * though, and assume what the user means is that the next line continues this line, which it does
261 * anyway. So basically we swallow the character and continue on with the next line.
263 issueFinding(ParserFindingType.P011_INVALID_CHARACTER_ESCAPING_IN_QUOTED_TEXT,
264 "Cannot have single backslash character at end of line.");
268 final char nextChar = trimmed.charAt(i);
270 if (nextChar == '\\') {
271 quotedString.append('\\');
272 } else if (nextChar == 'n') {
273 quotedString.append('\n');
274 } else if (nextChar == 't') {
275 quotedString.append('\t');
276 } else if (nextChar == '"') {
277 quotedString.append('"');
280 * RFC 6020 (YANG 1) does not explicitly say that any other character is disallowed.
281 * In contrast, RFC 7950 (YANG 1.1) explicitly states that any other character is
282 * not allowed. In either case, we handle it gracefully by appending the backslash
283 * and the character literally. But finding will only be issued for YANG 1.1 modules.
// Unknown escape sequence: both characters are kept exactly as written.
285 quotedString.append('\\');
286 quotedString.append(nextChar);
287 if (stringParseRules == StringParseRules.YANG1DOT1) {
288 issueFinding(ParserFindingType.P011_INVALID_CHARACTER_ESCAPING_IN_QUOTED_TEXT,
289 "Invalid character escaping (\\" + nextChar + ") inside double-quoted string.");
293 quotedString.append(charAt);
297 // end of quoted string not found yet, this line is fully consumed, continue...
// A single space is appended in place of the line break before the next line is processed.
298 quotedString.append(' ');
301 } else if (inSingleQuoteString) {
303 * We are within a string enclosed in single quotes ('). According to RFC6020:
304 * A single-quoted string (enclosed within ’ ’) preserves each character
305 * within the quotes. A single quote character cannot occur in a
306 * single-quoted string, even when preceded by a backslash.
308 * Try to find ending single quote
310 final int indexOfSingleQuote = str.indexOf('\'');
311 if (indexOfSingleQuote < 0) {
312 // not found, so the quoted text must stretch multiple lines. So we simply copy over the
313 // remainder of the string as-is (no trimming) plus a newline and done with this string here.
314 quotedString.append(str);
315 quotedString.append('\n');
318 // found, so go to the end of the quoted string, create token and then return the leftovers.
319 quotedString.append(str.substring(0, indexOfSingleQuote));
320 tokens.add(Token.newQuotedStringToken(currentLine, quotedString.toString()));
321 handleYangVersionStatement();
323 inSingleQuoteString = false;
324 return str.substring(indexOfSingleQuote + 1);
328 * Something else. Let's trim it first to remove all whitespace noise at beginning and end.
330 final String trimmed = str.trim();
331 if (trimmed.isEmpty()) {
338 if (trimmed.length() >= 2 && trimmed.startsWith("//")) {
339 // A single-comment line, we are done with the rest of the string.
342 if (trimmed.length() >= 2 && trimmed.startsWith("/*")) {
343 // block comment starts
344 inBlockComment = true;
345 return trimmed.substring(2);
349 * Check for special characters and create tokens as required
351 if (trimmed.charAt(0) == '{') {
352 tokens.add(Token.newLeftBraceToken(currentLine));
353 return trimmed.substring(1);
355 if (trimmed.charAt(0) == '}') {
356 tokens.add(Token.newRightBraceToken(currentLine));
357 return trimmed.substring(1);
359 if (trimmed.charAt(0) == ';') {
360 tokens.add(Token.newSemiColonToken(currentLine));
361 return trimmed.substring(1);
363 if (trimmed.charAt(0) == '+') {
364 tokens.add(Token.newPlusToken(currentLine));
365 return trimmed.substring(1);
369 * Check for beginning of double-quote or single-quote string
371 if (trimmed.charAt(0) == '"') {
372 quotedString = new StringBuilder(100);
373 inDoubleQuoteString = true;
374 String remainder = trimmed.substring(1);
376 * In case there are any spaces or tabs directly after the double-quote, these are retained.
378 while (!remainder.isEmpty() && (remainder.charAt(0) == ' ' || remainder.charAt(0) == '\t')) {
379 quotedString.append(remainder.charAt(0));
380 remainder = remainder.substring(1);
385 if (trimmed.charAt(0) == '\'') {
386 quotedString = new StringBuilder(100);
387 inSingleQuoteString = true;
388 return trimmed.substring(1);
392 * Some other string not in quotes. Consume one character at a time until we
393 * either reach the end of the string or a whitespace character or a special
394 * character. Note no character escaping allowed.
396 final StringBuilder unquotedString = new StringBuilder(100);
397 for (int i = 0; i < trimmed.length(); ++i) {
398 final char charAt = trimmed.charAt(i);
400 if (charAt == ';' || charAt == '{') {
402 tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
403 handleYangVersionStatement();
404 return trimmed.substring(i);
406 } else if (charAt == '"' || charAt == '\'') {
408 * This rule has changed in YANG 1.1. Prior to that, double-quotes and single-quotes
409 * were ok inside unquoted text; starting with YANG 1.1, these are not acceptable
410 * anymore. Either case we continue (best effort).
412 if (stringParseRules == StringParseRules.YANG1DOT1) {
413 issueFinding(ParserFindingType.P012_INVALID_CHARACTER_IN_UNQUOTED_TEXT,
414 "Single-quote or double-quote character not allowed inside non-quoted string.");
417 } else if (Character.isWhitespace(charAt)) {
418 // done with this unquoted string
419 tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
420 handleYangVersionStatement();
421 return trimmed.substring(i + 1);
424 unquotedString.append(charAt);
427 // reached string-end, then done as well.
428 tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
429 handleYangVersionStatement();
435 * In order to apply YANG 1 or YANG 1.1. parse rules we need to know whether this is a
436 * YANG 1 or YANG 1.1 module. The only way of finding this out is by looking at the
437 * "yang-version" statement. This method tries to find this statement, and its value
// Inspects the two most recently added tokens. When they are the yang-version keyword followed by a
// string value, the matching string parse rules are selected and the statement is marked as handled.
// Returns without doing anything once the statement has been handled, or while fewer than two tokens exist.
440 private void handleYangVersionStatement() {
442 if (yangVersionStatementAlreadyHandled) {
447 * We try to find tokens as follows:
448 * - Second-last token is a string token with value 'yang-version'
449 * - Last token is a string token with value 1 or 1.0 or 1.1.
451 * Note: "1.0" is not a valid YANG version according to RFC. But what are the chances
452 * of somebody using it...
455 final int tokensSize = tokens.size();
456 if (tokensSize < 2) {
// Both the keyword and its value are accepted in quoted and in unquoted form.
460 final Token secondLastToken = tokens.get(tokensSize - 2);
461 if ((secondLastToken.type == TokenType.QUOTED_STRING || secondLastToken.type == TokenType.STRING) && secondLastToken.value
462 .equals("yang-version")) {
464 final Token lastToken = tokens.get(tokensSize - 1);
465 if ((lastToken.type == TokenType.QUOTED_STRING || lastToken.type == TokenType.STRING)) {
// NOTE(review): the case labels are not visible in this excerpt. Going by the comment further up,
// presumably the values 1 and 1.0 select YANG1 and the value 1.1 selects YANG1DOT1 - confirm.
466 switch (lastToken.value) {
469 stringParseRules = StringParseRules.YANG1;
470 yangVersionStatementAlreadyHandled = true;
473 stringParseRules = StringParseRules.YANG1DOT1;
474 yangVersionStatementAlreadyHandled = true;
// Reports a finding of the given type, with the given message, against the model being tokenized and
// the line currently being processed.
481 private void issueFinding(final ParserFindingType findingType, final String message) {
482 context.addFinding(new Finding(yangModel, currentLine, findingType.toString(), message));
486 * The string parse rules have slightly changed between YANG 1 and YANG 1.1.
488 private enum StringParseRules {