gerrit.o-ran-sc Code Review - smo/teiv.git/blob

   1 /*
   2  *  ============LICENSE_START=======================================================
   3  *  Copyright (C) 2024 Ericsson
   4  *  Modifications Copyright (C) 2024 OpenInfra Foundation Europe
   5  *  ================================================================================
   6  *  Licensed under the Apache License, Version 2.0 (the "License");
   7  *  you may not use this file except in compliance with the License.
   8  *  You may obtain a copy of the License at
   9  *
  10  *        http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  *  Unless required by applicable law or agreed to in writing, software
  13  *  distributed under the License is distributed on an "AS IS" BASIS,
  14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  *  See the License for the specific language governing permissions and
  16  *  limitations under the License.
  17  *
  18  *  SPDX-License-Identifier: Apache-2.0
  19  *  ============LICENSE_END=========================================================
  20  */
  21 package org.oran.smo.yangtools.parser.model.parser;
  22
  23 import java.io.BufferedReader;
  24 import java.io.IOException;
  25 import java.io.InputStream;
  26 import java.io.InputStreamReader;
  27 import java.nio.charset.Charset;
  28 import java.util.ArrayList;
  29 import java.util.List;
  30
  31 import org.oran.smo.yangtools.parser.ParserExecutionContext;
  32 import org.oran.smo.yangtools.parser.findings.Finding;
  33 import org.oran.smo.yangtools.parser.findings.ParserFindingType;
  34 import org.oran.smo.yangtools.parser.model.YangModel;
  35 import org.oran.smo.yangtools.parser.model.parser.Token.TokenType;
  36
  37 /**
  38  * The sole purpose of this class is to tokenize a YAM (supplied as an input stream). Tokens are generated
  39  * for various constructs of special significance (for example, the {} characters). Comment sections are skipped.
  40  * String concatenation (usage of the + character) is also performed.
  41  * <p>
  42  * A new instance of this class must be created for each YAM parsed.
  43  *
  44  * @author Mark Hollmann
  45  */
  46 public class YamTokenizer {
  47
  48     private static final String EMPTY_STRING = "";
  49
  50     private boolean yangVersionStatementAlreadyHandled = false;
  51     private StringParseRules stringParseRules = StringParseRules.YANG1;
  52
  53     private int currentLine = 0;
  54     private int charCount = 0;
  55
  56     private boolean inBlockComment = false;
  57     private boolean inDoubleQuoteString = false;
  58     private boolean inSingleQuoteString = false;
  59     private StringBuilder quotedString;
  60
  61     private final List<Token> tokens = new ArrayList<>(10000);
  62
  63     private final ParserExecutionContext context;
  64     private final YangModel yangModel;
  65     private final InputStream is;
  66
  67     public YamTokenizer(final ParserExecutionContext context, final YangModel yangModel, final InputStream is)
  68             throws IOException {
  69         this.context = context;
  70         this.yangModel = yangModel;
  71         this.is = is;
  72     }
  73
  74     public TokenIterator tokenize() throws IOException {
  75
  76         final BufferedReader br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
  77
  78         String str;
  79
  80         while ((str = br.readLine()) != null) {
  81             currentLine++;
  82
  83             charCount += str.length();
  84             charCount++;                                        // +1 for the new-line character that the readLine() method will swallow
  85
  86             if (inDoubleQuoteString && str.trim().isEmpty()) {
  87                 quotedString.append('\n');
  88             } else {
  89                 while (!str.isEmpty()) {
  90                     str = processString(str);
  91                 }
  92             }
  93         }
  94
  95         /*
  96          * Make sure there are no dangling quoted strings and block comments at the end of the document.
  97          */
  98         if (inBlockComment) {
  99             context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
 100                     "Document ends with an unclosed block comment. Be sure to close block comments with '*/'."));
 101             return null;
 102         }
 103         if (inDoubleQuoteString || inSingleQuoteString) {
 104             context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
 105                     "Document ends with non-terminated single- or double-quoted string."));
 106             return null;
 107         }
 108
 109         /*
 110          * Make sure that the last token is not a + token
 111          */
 112         if (!tokens.isEmpty() && tokens.get(tokens.size() - 1).type == TokenType.PLUS) {
 113             context.addFinding(new Finding(yangModel, ParserFindingType.P014_INVALID_SYNTAX_AT_DOCUMENT_END,
 114                     "Document ends with a '+' symbol."));
 115             return null;
 116         }
 117
 118         final List<Token> result = new ArrayList<>(tokens.size());
 119
 120         /*
 121          * Now we clean up the tokens and concatenate string tokens with their corresponding plus tokens.
 122          * That takes care of constructs such as "'Hello ' + 'World!'" in the model.
 123          */
 124         for (int i = 0; i < tokens.size(); ++i) {
 125
 126             final Token oneToken = tokens.get(i);
 127
 128             if (oneToken.type == TokenType.QUOTED_STRING) {
 129                 /*
 130                  * Concatenate quote strings that have + symbols between them
 131                  */
 132                 final StringBuilder sb = new StringBuilder(10000);
 133                 sb.append(oneToken.value);
 134                 final int lineNumberStart = oneToken.lineNumber;
 135
 136                 while (i + 1 < tokens.size()) {
 137                     if (tokens.get(i + 1).type == TokenType.PLUS) {
 138                         if (tokens.get(i + 2).type == TokenType.QUOTED_STRING) {
 139                             // regular concatenation
 140                             sb.append(tokens.get(i + 2).value);
 141                             i += 2;
 142                         } else if (tokens.get(i + 2).type == TokenType.STRING) {
 143                             /*
 144                              * We have the following: "Hello " + World. Technically disallowed by the spec, but we allow it.
 145                              */
 146                             context.addFinding(new Finding(yangModel, tokens.get(i + 2).lineNumber,
 147                                     ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
 148                                     "The '+' symbol is followed by an unquoted string (the string must be quoted for the '+' to work)."));
 149                             sb.append(tokens.get(i + 2).value);
 150                             i += 2;
 151                         } else if (tokens.get(i + 2).type == TokenType.PLUS) {
 152                             /*
 153                              * We have a "+ +" in the document. Illegal syntax, but we allow it.
 154                              */
 155                             context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
 156                                     ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
 157                                     "The '+' symbol is repeated. Remove one of them."));
 158                             i++;
 159                         } else {
 160                             /*
 161                              * Something else unexpected. Hard finding.
 162                              */
 163                             context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
 164                                     ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
 165                                     "The '+' symbol is not followed by quoted string."));
 166                             return null;
 167                         }
 168                     } else {
 169                         break;
 170                     }
 171                 }
 172
 173                 result.add(Token.newStringToken(lineNumberStart, sb.toString()));
 174
 175             } else if (oneToken.type == TokenType.PLUS) {
 176                 /*
 177                  * A plus token cannot just exist by itself, it must always sit between quoted strings.
 178                  */
 179                 context.addFinding(new Finding(yangModel, oneToken.lineNumber,
 180                         ParserFindingType.P015_INVALID_SYNTAX_IN_DOCUMENT.toString(),
 181                         "Encountered standalone '+' symbol (are the strings on either side of it quoted?)."));
 182                 return null;
 183
 184             } else if (oneToken.type == TokenType.SEMI_COLON) {
 185                 result.add(oneToken);
 186                 /*
 187                  * Consume any repeated semicolons. That's not a finding, but simply poor model editing.
 188                  */
 189                 while (i + 1 < tokens.size()) {
 190                     if (tokens.get(i + 1).type != TokenType.SEMI_COLON) {
 191                         break;
 192                     }
 193                     context.addFinding(new Finding(yangModel, tokens.get(i + 1).lineNumber,
 194                             ParserFindingType.P055_SUPERFLUOUS_STATEMENT.toString(), "Multiple semicolons."));
 195                     i++;
 196                 }
 197             } else {
 198                 /*
 199                  * An unquoted string, or a left/right brace. Retain as-is.
 200                  */
 201                 result.add(oneToken);
 202             }
 203         }
 204
 205         return new TokenIterator(result);
 206     }
 207
 208     public int getCharCount() {
 209         return charCount;
 210     }
 211
 212     public int getLineCount() {
 213         return currentLine;
 214     }
 215
 216     /**
 217      * Process the supplied string. Whatever part of the string could not be processed will be returned for iterative
 218      * processing.
 219      */
 220     private String processString(final String str) {
 221
 222         if (inBlockComment) {
 223             /*
 224              * Must try to find character sequence star-slash that ends the block comment.
 225              */
 226             final int indexOf = str.indexOf("*/");
 227             if (indexOf < 0) {
 228                 // not found, the whole string is part of the comment, so we are done processing this string here
 229                 return EMPTY_STRING;
 230             }
 231
 232             // found, so end the comment and return everything after the comment for further processing.
 233             inBlockComment = false;
 234             return str.substring(indexOf + 2);
 235
 236         } else if (inDoubleQuoteString) {
 237             /*
 238              * Must try to find the closing quote character. Must be careful with character escaping.
 239              * We trim anyway (idiotic YANG rule about indentation).
 240              */
 241             final String trimmed = str.trim();
 242
 243             for (int i = 0; i < trimmed.length(); ++i) {
 244                 final char charAt = trimmed.charAt(i);
 245
 246                 if (charAt == '"') {
 247                     // done with this double-quoted string. Create token and return leftovers, if any
 248                     tokens.add(Token.newQuotedStringToken(currentLine, quotedString.toString()));
 249                     handleYangVersionStatement();
 250
 251                     inDoubleQuoteString = false;
 252                     return trimmed.substring(i + 1);
 253                 }
 254                 if (charAt == '\\') {
 255                     // possible escaping
 256                     i++;
 257                     if (i >= trimmed.length()) {
 258                         /*
 259                          * The backslash character is the last character on the line. This is always wrong. We will be nice,
 260                          * though, and assume what the user means is that the next line continues this line, which it does
 261                          * anyway. So basically we swallow the character and continue on with the next line.
 262                          */
 263                         issueFinding(ParserFindingType.P011_INVALID_CHARACTER_ESCAPING_IN_QUOTED_TEXT,
 264                                 "Cannot have single backslash character at end of line.");
 265                         return EMPTY_STRING;
 266                     }
 267
 268                     final char nextChar = trimmed.charAt(i);
 269
 270                     if (nextChar == '\\') {
 271                         quotedString.append('\\');
 272                     } else if (nextChar == 'n') {
 273                         quotedString.append('\n');
 274                     } else if (nextChar == 't') {
 275                         quotedString.append('\t');
 276                     } else if (nextChar == '"') {
 277                         quotedString.append('"');
 278                     } else {
 279                         /*
 280                          * RFC 6020 (YANG 1) does not explicitly say that any other character is disallowed.
 281                          * In contrast, RFC 7950 (YANG 1.1) explicitly states that any other character is
 282                          * not allowed. In either case, we handle it gracefully by appending the backslash
 283                          * and the character literally. But finding will only be issued for YANG 1.1 modules.
 284                          */
 285                         quotedString.append('\\');
 286                         quotedString.append(nextChar);
 287                         if (stringParseRules == StringParseRules.YANG1DOT1) {
 288                             issueFinding(ParserFindingType.P011_INVALID_CHARACTER_ESCAPING_IN_QUOTED_TEXT,
 289                                     "Invalid character escaping (\\" + nextChar + ") inside double-quoted string.");
 290                         }
 291                     }
 292                 } else {
 293                     quotedString.append(charAt);
 294                 }
 295             }
 296
 297             // end of quoted string not found yet, this line is fully consumed, continue...
 298             quotedString.append(' ');
 299             return EMPTY_STRING;
 300
 301         } else if (inSingleQuoteString) {
 302             /*
 303              * We are within a string enclosed in single quotes ('). According to RFC6020:
 304              * A single-quoted string (enclosed within ’ ’) preserves each character
 305              * within the quotes. A single quote character cannot occur in a
 306              * single-quoted string, even when preceded by a backslash.
 307              *
 308              * Try to find ending single quote
 309              */
 310             final int indexOfSingleQuote = str.indexOf('\'');
 311             if (indexOfSingleQuote < 0) {
 312                 // not found, so the quoted text must stretch multiple lines. So we simply copy over the
 313                 // remainder of the string as-is (no trimming) plus a newline and done with this string here.
 314                 quotedString.append(str);
 315                 quotedString.append('\n');
 316                 return EMPTY_STRING;
 317             }
 318             // found, so go to the end of the quoted string, create token and then return the leftovers.
 319             quotedString.append(str.substring(0, indexOfSingleQuote));
 320             tokens.add(Token.newQuotedStringToken(currentLine, quotedString.toString()));
 321             handleYangVersionStatement();
 322
 323             inSingleQuoteString = false;
 324             return str.substring(indexOfSingleQuote + 1);
 325
 326         } else {
 327             /*
 328              * Something else. Let's trim it first to remove all whitespace noise at beginning and end.
 329              */
 330             final String trimmed = str.trim();
 331             if (trimmed.isEmpty()) {
 332                 return EMPTY_STRING;
 333             }
 334
 335             /*
 336              * Check for comments
 337              */
 338             if (trimmed.length() >= 2 && trimmed.startsWith("//")) {
 339                 // A single-comment line, we are done with the rest of the string.
 340                 return EMPTY_STRING;
 341             }
 342             if (trimmed.length() >= 2 && trimmed.startsWith("/*")) {
 343                 // block comment starts
 344                 inBlockComment = true;
 345                 return trimmed.substring(2);
 346             }
 347
 348             /*
 349              * Check for special characters and create tokens as required
 350              */
 351             if (trimmed.charAt(0) == '{') {
 352                 tokens.add(Token.newLeftBraceToken(currentLine));
 353                 return trimmed.substring(1);
 354             }
 355             if (trimmed.charAt(0) == '}') {
 356                 tokens.add(Token.newRightBraceToken(currentLine));
 357                 return trimmed.substring(1);
 358             }
 359             if (trimmed.charAt(0) == ';') {
 360                 tokens.add(Token.newSemiColonToken(currentLine));
 361                 return trimmed.substring(1);
 362             }
 363             if (trimmed.charAt(0) == '+') {
 364                 tokens.add(Token.newPlusToken(currentLine));
 365                 return trimmed.substring(1);
 366             }
 367
 368             /*
 369              * Check for beginning of double-quote or single-quote string
 370              */
 371             if (trimmed.charAt(0) == '"') {
 372                 quotedString = new StringBuilder(100);
 373                 inDoubleQuoteString = true;
 374                 String remainder = trimmed.substring(1);
 375                 /*
 376                  * In case there are any spaces or tabs directly after the double-quote, these are retained.
 377                  */
 378                 while (!remainder.isEmpty() && (remainder.charAt(0) == ' ' || remainder.charAt(0) == '\t')) {
 379                     quotedString.append(remainder.charAt(0));
 380                     remainder = remainder.substring(1);
 381                 }
 382
 383                 return remainder;
 384             }
 385             if (trimmed.charAt(0) == '\'') {
 386                 quotedString = new StringBuilder(100);
 387                 inSingleQuoteString = true;
 388                 return trimmed.substring(1);
 389             }
 390
 391             /*
 392              * Some other string not in quotes. Consume one character at a time until we
 393              * either reach the end of the string or a whitespace character or a special
 394              * character. Note no character escaping allowed.
 395              */
 396             final StringBuilder unquotedString = new StringBuilder(100);
 397             for (int i = 0; i < trimmed.length(); ++i) {
 398                 final char charAt = trimmed.charAt(i);
 399
 400                 if (charAt == ';' || charAt == '{') {
 401                     // reached the end
 402                     tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
 403                     handleYangVersionStatement();
 404                     return trimmed.substring(i);
 405
 406                 } else if (charAt == '"' || charAt == '\'') {
 407                     /*
 408                      * This rule has changed in YANG 1.1. Prior to that, double-quotes and single-quotes
 409                      * were ok inside unquoted text; starting with YANG 1.1, these are not acceptable
 410                      * anymore. Either case we continue (best effort).
 411                      */
 412                     if (stringParseRules == StringParseRules.YANG1DOT1) {
 413                         issueFinding(ParserFindingType.P012_INVALID_CHARACTER_IN_UNQUOTED_TEXT,
 414                                 "Single-quote or double-quote character not allowed inside non-quoted string.");
 415                     }
 416
 417                 } else if (Character.isWhitespace(charAt)) {
 418                     // done with this unquoted string
 419                     tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
 420                     handleYangVersionStatement();
 421                     return trimmed.substring(i + 1);
 422                 }
 423
 424                 unquotedString.append(charAt);
 425             }
 426
 427             // reached string-end, then done as well.
 428             tokens.add(Token.newStringToken(currentLine, unquotedString.toString()));
 429             handleYangVersionStatement();
 430             return EMPTY_STRING;
 431         }
 432     }
 433
 434     /**
 435      * In order to apply YANG 1 or YANG 1.1. parse rules we need to know whether this is a
 436      * YANG 1 or YANG 1.1 module. The only way of finding this out is by looking at the
 437      * "yang-version" statement. This method tries to find this statement, and its value
 438      * ("1" or "1.1").
 439      */
 440     private void handleYangVersionStatement() {
 441
 442         if (yangVersionStatementAlreadyHandled) {
 443             return;
 444         }
 445
 446         /*
 447          * We try to find tokens as follows:
 448          * - Second-last token is a string token with value 'yang-version'
 449          * - Last token is a string token with value 1 or 1.0 or 1.1.
 450          *
 451          * Note: "1.0" is not a valid YANG version according to RFC. But what are the chances
 452          * of somebody using it...
 453          */
 454
 455         final int tokensSize = tokens.size();
 456         if (tokensSize < 2) {
 457             return;
 458         }
 459
 460         final Token secondLastToken = tokens.get(tokensSize - 2);
 461         if ((secondLastToken.type == TokenType.QUOTED_STRING || secondLastToken.type == TokenType.STRING) && secondLastToken.value
 462                 .equals("yang-version")) {
 463
 464             final Token lastToken = tokens.get(tokensSize - 1);
 465             if ((lastToken.type == TokenType.QUOTED_STRING || lastToken.type == TokenType.STRING)) {
 466                 switch (lastToken.value) {
 467                     case "1":
 468                     case "1.0":
 469                         stringParseRules = StringParseRules.YANG1;
 470                         yangVersionStatementAlreadyHandled = true;
 471                         break;
 472                     case "1.1":
 473                         stringParseRules = StringParseRules.YANG1DOT1;
 474                         yangVersionStatementAlreadyHandled = true;
 475                         break;
 476                 }
 477             }
 478         }
 479     }
 480
 481     private void issueFinding(final ParserFindingType findingType, final String message) {
 482         context.addFinding(new Finding(yangModel, currentLine, findingType.toString(), message));
 483     }
 484
 485     /**
 486      * The string parse rules have slightly changed between YANG 1 and YANG 1.1.
 487      */
 488     private enum StringParseRules {
 489         YANG1,
 490         YANG1DOT1
 491     }
 492 }