# This file is part of the standard library of Pycopy project, minimalist # and lightweight Python implementation. # # https://github.com/pfalcon/pycopy # https://github.com/pfalcon/pycopy-lib # # The MIT License (MIT) # # Copyright (c) 2018-2019 Paul Sokolovsky # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
"""Minimal streaming (pull) XML tokenizer.

Tokens are produced lazily as 4-element lists:

    [START_TAG, namespace, name, None]
    [END_TAG,   namespace, name, None]
    [ATTR,      namespace, name, value]
    [PI,        target,    None, None]
    [TEXT,      text,      None, None]

``namespace`` is "" when the name has no ``prefix:`` part.  Attributes are
emitted as separate ATTR tokens right after the START_TAG / PI token they
belong to.  A self-closing tag (``<a/>``) produces START_TAG followed by
END_TAG.  Comments (``<!-- ... -->``) are skipped silently.

The tokenizer reuses ONE list object for every token it yields; copy the
token (e.g. ``tuple(tok)``) if it must survive the next iteration step.
"""

TEXT = "TEXT"
START_TAG = "START_TAG"
#START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
#PI_DONE = "PI_DONE"
ATTR = "ATTR"
#ATTR_VAL = "ATTR_VAL"


class XMLSyntaxError(Exception):
    """Raised when the input does not match the XML subset handled here."""


class XMLTokenizer:
    """Character-at-a-time tokenizer over a text stream.

    ``f`` must provide ``read(1)`` returning a one-character string, or ""
    at end of input.  ``self.c`` always holds the current lookahead
    character ("" once the stream is exhausted).
    """

    def __init__(self, f):
        self.f = f
        self.c = ""
        # Prime the one-character lookahead.
        self.nextch()

    def getch(self):
        """Return the current character and advance to the next one."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """Return True once the lookahead is empty (stream exhausted)."""
        return self.c == ""

    def nextch(self):
        """Read the next character of the stream into the lookahead."""
        self.c = self.f.read(1)

    def skip_ws(self):
        """Advance past any whitespace at the current position."""
        # "".isspace() is False, so this stops at end of input.
        while self.c.isspace():
            self.nextch()

    def isident(self):
        """Skip whitespace and tell whether an identifier starts here.

        A leading "_" is accepted as well as a letter, matching what
        getident() already accepts for tag names.
        """
        self.skip_ws()
        return self.c.isalpha() or self.c == "_"

    def getident(self):
        """Skip whitespace, then consume and return an identifier.

        Identifier characters are letters, digits, "_", "-" and ".".
        Returns "" if no identifier character is present.
        """
        self.skip_ws()
        chars = []
        while self.c:
            c = self.c
            if not (c.isalpha() or c.isdigit() or c in "_-."):
                break
            chars.append(self.getch())
        return "".join(chars)

    def putnsident(self, res):
        """Parse ``[ns:]ident`` and store it as res[1] (ns) and res[2]."""
        ns = ""
        ident = self.getident()
        if self.c == ":":
            # What was read so far was the namespace prefix.
            self.nextch()
            ns = ident
            ident = self.getident()
        res[1] = ns
        res[2] = ident

    def match(self, c):
        """Skip whitespace; consume ``c`` if it is next and return True."""
        self.skip_ws()
        if self.c == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Like match(), but raise XMLSyntaxError if ``c`` is not next."""
        if not self.match(c):
            raise XMLSyntaxError("expected %r" % c)

    def lex_attrs_till(self, res):
        """Yield an ATTR token for every ``name="value"`` pair found here.

        Stops (without consuming) at the first thing that does not start
        an identifier.  Values may be quoted with either ' or "; they are
        returned verbatim (no entity decoding).  Raises XMLSyntaxError on
        a missing "=", a missing quote, or an unterminated value.
        """
        while self.isident():
            res[0] = ATTR
            self.putnsident(res)
            self.expect("=")
            # XML permits whitespace on both sides of "=".
            self.skip_ws()
            quote = self.getch()
            if quote != '"' and quote != "'":
                raise XMLSyntaxError("attribute value must be quoted")
            chunks = []
            while self.c != quote:
                # Without this check an unterminated value would loop
                # forever, since the lookahead stays "" at end of input.
                if self.eof():
                    raise XMLSyntaxError("unterminated attribute value")
                chunks.append(self.getch())
            self.nextch()  # consume the closing quote
            res[3] = "".join(chunks)
            yield res
            # Only ATTR tokens carry a value; reset for whatever follows.
            res[3] = None

    def _skip_comment(self):
        """Consume a comment body up to and including its closing "-->"."""
        last3 = ""
        while last3 != "-->":
            # Guard against end of input, which would otherwise hang here.
            if self.eof():
                raise XMLSyntaxError("unterminated comment")
            last3 = last3[-2:] + self.getch()

    def tokenize(self):
        """Generate tokens for the whole stream (see module docstring).

        The same list object is yielded every time and mutated in place.
        Because match() skips whitespace before testing for "<", leading
        whitespace of a text run is dropped and whitespace-only text
        between tags produces no TEXT token.

        Raises XMLSyntaxError on malformed or truncated markup.
        """
        res = [None, None, None, None]
        while not self.eof():
            if self.match("<"):
                if self.match("/"):
                    res[0] = END_TAG
                    self.putnsident(res)
                    yield res
                    self.expect(">")
                elif self.match("?"):
                    res[0] = PI
                    res[1] = self.getident()
                    # Do not leak the previous token's name into the PI.
                    res[2] = None
                    yield res
                    yield from self.lex_attrs_till(res)
                    self.expect("?")
                    self.expect(">")
                elif self.match("!"):
                    self.expect("-")
                    self.expect("-")
                    self._skip_comment()
                else:
                    res[0] = START_TAG
                    self.putnsident(res)
                    # Remember the tag name: ATTR tokens overwrite
                    # res[1]/res[2] before a possible "/>" is seen.
                    ns = res[1]
                    tag = res[2]
                    yield res
                    yield from self.lex_attrs_till(res)
                    if self.match("/"):
                        res[0] = END_TAG
                        res[1] = ns
                        res[2] = tag
                        yield res
                    self.expect(">")
            else:
                chunks = []
                while self.c and self.c != "<":
                    chunks.append(self.getch())
                text = "".join(chunks)
                if text:
                    res[0] = TEXT
                    res[1] = text
                    res[2] = None
                    yield res


def gfind(gen, pred):
    """Advance ``gen`` and return the first item for which ``pred`` holds.

    Returns None if the iterable is exhausted without a match.
    """
    for item in gen:
        if pred(item):
            return item
    return None


def text_of(gen, tag):
    """Return the text content of a leaf tag from a tokenizer stream.

    ``tag`` is either a tag name (namespace ignored) or a
    ``(namespace, name)`` tuple.  ``gen`` must be an iterator; it is
    advanced past the matching START_TAG and the token that follows it.

    The tag is assumed to have no attributes and non-empty text: the token
    following START_TAG is asserted to be TEXT.  If no matching tag is
    found, the exhausted iterator makes ``next()`` raise StopIteration.
    """
    def match_tag(t):
        if t[0] != START_TAG:
            return False
        if isinstance(tag, tuple):
            return t[1] == tag[0] and t[2] == tag[1]
        return t[2] == tag

    gfind(gen, match_tag)
    # Assumes no attributes
    res = next(gen)
    assert res[0] == TEXT
    return res[1]


def tokenize(file):
    """Return a token generator for the text stream ``file``."""
    return XMLTokenizer(file).tokenize()