Page Speed Optimization Libraries  1.13.35.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
html_lexer.h
Go to the documentation of this file.
1 /*
2  * Copyright 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #ifndef PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
20 #define PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
21 
22 #include <vector>
23 
32 
33 namespace net_instaweb {
34 
35 class HtmlParse;
36 
// HtmlLexer: character-driven lexer for HTML text.  The declarations below
// show a State enum, a current-state member (state_), and one Eval* handler
// per character for most states; token text is accumulated in GoogleString
// members and open elements are tracked in element_stack_.
// NOTE(review): the precise events emitted to HtmlParse are implemented in
// the .cc file and are not visible from this header.
45 class HtmlLexer {
46  public:
    // Constructs a lexer for the given HtmlParse.
    // NOTE(review): ownership of html_parse is not visible here; it is held as
    // a raw pointer (html_parse_), so presumably it is not owned -- confirm.
47  explicit HtmlLexer(HtmlParse* html_parse);
48  ~HtmlLexer();
49 
    // Initialize a new parse session, id is only used for error messages.
51  void StartParse(const StringPiece& id, const ContentType& content_type);
52 
    // Supplies `size` bytes of input starting at `text`.
    // NOTE(review): presumably callable repeatedly between StartParse() and
    // FinishParse() to feed a document in chunks -- confirm in the .cc file.
55  void Parse(const char* text, int size);
56 
    // Completes parse, reporting any leftover text as a final
    // HtmlCharacterEvent.
58  void FinishParse();
59 
    // Determines whether a tag should be terminated in HTML.
61  bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
62 
    // Static predicate on a tag keyword.
    // NOTE(review): judging by the name and by the literal_/literal_close_
    // members, presumably reports tags whose content is lexed as raw literal
    // text -- confirm in the .cc file.
66  static bool IsLiteralTag(HtmlName::Keyword keyword);
67 
    // Static predicate on a tag keyword.
    // NOTE(review): presumably the conditional variant of IsLiteralTag();
    // the conditions are not visible from this header -- confirm.
72  static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
73 
    // Determines whether a tag can be terminated briefly.
75  bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
76 
    // Determines whether it's OK to leave a tag unclosed.
78  bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
79 
    // Print element stack to stdout (for debugging).
81  void DebugPrintStack();
82 
    // Returns an HtmlElement pointer.
    // NOTE(review): presumably the innermost open element on element_stack_,
    // possibly NULL when the stack is empty -- confirm in the .cc file.
85  HtmlElement* Parent() const;
86 
    // Read-only access to the doctype_ member.
89  const DocType& doctype() const { return doctype_; }
90 
    // Sets the limit on the maximum number of bytes that should be parsed.
92  void set_size_limit(int64 x) { size_limit_ = x; }
93 
    // Returns the size_limit_exceeded_ flag.
96  bool size_limit_exceeded() const { return size_limit_exceeded_; }
97 
98  private:
    // Per-character handlers.  Each takes the next input character; the names
    // parallel the State enumerators below (e.g. EvalCommentBody and
    // COMMENT_BODY), so presumably each is invoked when state_ holds the
    // matching value -- the dispatch itself lives in the .cc file.
100  inline void EvalStart(char c);
101  inline void EvalTag(char c);
102  inline void EvalTagOpen(char c);
103  inline void EvalTagCloseNoName(char c);
104  inline void EvalTagClose(char c);
105  inline void EvalTagBriefClose(char c);
106  inline void EvalCommentStart1(char c);
107  inline void EvalCommentStart2(char c);
108  inline void EvalCommentBody(char c);
109  inline void EvalCommentEnd1(char c);
110  inline void EvalCommentEnd2(char c);
111  inline void EvalCdataStart1(char c);
112  inline void EvalCdataStart2(char c);
113  inline void EvalCdataStart3(char c);
114  inline void EvalCdataStart4(char c);
115  inline void EvalCdataStart5(char c);
116  inline void EvalCdataStart6(char c);
117  inline void EvalCdataBody(char c);
118  inline void EvalCdataEnd1(char c);
119  inline void EvalCdataEnd2(char c);
120  inline void EvalAttribute(char c);
121  inline void EvalAttrName(char c);
122  inline void EvalAttrNameSpace(char c);
123  inline void EvalAttrEq(char c);
124  inline void EvalAttrVal(char c);
125  inline void EvalAttrValSq(char c);
126  inline void EvalAttrValDq(char c);
127  inline void EvalLiteralTag(char c);
128  inline void EvalScriptTag(char c);
129  inline void EvalDirective(char c);
130  inline void EvalBogusComment(char c);
131 
    // Element/attribute construction helpers.
    // NOTE(review): presumably these build element_ and its attributes from
    // token_, attr_name_, attr_value_ and attr_quote_ -- confirm in the .cc.
134  void MakeElement();
135 
136  void MakeAttribute(bool has_value);
137  void FinishAttribute(char c, bool has_value, bool brief_close);
138 
    // Emit* helpers, one per kind of lexed construct (CDATA, comment, literal,
    // tag open/close, directive).  What each one emits, and to whom, is not
    // visible from this header.
139  void EmitCdata();
140  void EmitComment();
141  void EmitLiteral();
142  void EmitTagOpen(bool allow_implicit_close);
143  void EmitTagClose(HtmlElement::Style style);
144  void EmitTagBriefClose();
145  void EmitDirective();
146  void Restart(char c);
147 
    // Reports a syntax error using a printf-style format string.
    // NOTE(review): the (2, 3) indices presumably count the implicit `this`
    // as argument 1, as GCC's format attribute does for member functions.
149  void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
150 
    // Element-stack helpers.
    // NOTE(review): names suggest PopElementMatchingTag() searches
    // element_stack_ for an element named `tag` and PopElement() removes the
    // top entry; the return value on a miss or empty stack is not visible
    // here -- confirm.
161  HtmlElement* PopElementMatchingTag(const StringPiece& tag);
162 
163  HtmlElement* PopElement();
164  void CloseElement(HtmlElement* element, HtmlElement::Style style);
165 
    // True when the high bit (0x80) of c is set, i.e. c is outside 7-bit
    // ASCII.
169  static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }
170 
    // Character-class predicates for tag and attribute names; the accepted
    // character sets are defined in the .cc file.
172  static inline bool IsLegalTagFirstChar(char c);
174  static inline bool IsLegalTagChar(char c);
175 
177  static inline bool IsLegalAttrNameChar(char c);
178 
    // Lexer states; state_ holds the current one.  Most enumerators have a
    // same-named Eval* handler declared above.  TAG_CLOSE_TERMINATE is the
    // exception: no EvalTagCloseTerminate is declared in this class.
185  enum State {
186  START,
187  TAG,
188  TAG_CLOSE_NO_NAME,
189  TAG_CLOSE,
190  TAG_CLOSE_TERMINATE,
191  TAG_OPEN,
192  TAG_BRIEF_CLOSE,
193  COMMENT_START1,
194  COMMENT_START2,
195  COMMENT_BODY,
196  COMMENT_END1,
197  COMMENT_END2,
198  CDATA_START1,
199  CDATA_START2,
200  CDATA_START3,
201  CDATA_START4,
202  CDATA_START5,
203  CDATA_START6,
204  CDATA_BODY,
205  CDATA_END1,
206  CDATA_END2,
207  TAG_ATTRIBUTE,
208  TAG_ATTR_NAME,
209  TAG_ATTR_NAME_SPACE,
210  TAG_ATTR_EQ,
211  TAG_ATTR_VAL,
212  TAG_ATTR_VALDQ,
213  TAG_ATTR_VALSQ,
214  LITERAL_TAG,
215  SCRIPT_TAG,
216  DIRECTIVE,
217  BOGUS_COMMENT,
218  };
219 
    // The HtmlParse passed to the constructor (raw pointer).
220  HtmlParse* html_parse_;
    // Current lexer state.
221  State state_;
    // Text accumulators.
    // NOTE(review): presumably token_ holds the tag/directive text being
    // scanned, literal_ the pending character data, and attr_name_ /
    // attr_value_ / attr_quote_ the attribute in progress -- confirm.
222  GoogleString token_;
223  GoogleString literal_;
224  GoogleString attr_name_;
225  GoogleString attr_value_;
226  HtmlElement::QuoteStyle attr_quote_;
227  bool has_attr_value_;
    // Element currently under construction (see MakeElement()) -- presumably.
228  HtmlElement* element_;
    // Line counters.
    // NOTE(review): presumably the current input line and the line on which
    // the current tag began, for use in error messages -- confirm.
229  int line_;
230  int tag_start_line_;
    // NOTE(review): presumably the `id` given to StartParse(), which is only
    // used for error messages.
231  GoogleString id_;
232  GoogleString literal_close_;
233  bool script_html_comment_;
234  bool script_html_comment_script_;
235  bool discard_until_start_state_for_error_recovery_;
238 
239  ContentType content_type_;
    // Exposed read-only through doctype().
240  DocType doctype_;
241 
    // Stack of HtmlElement pointers; see Parent(), PopElement(),
    // PopElementMatchingTag() and DebugPrintStack().
242  std::vector<HtmlElement*> element_stack_;
243 
    // Exposed through size_limit_exceeded().
246  bool size_limit_exceeded_;
249  bool skip_parsing_;
250  int64 num_bytes_parsed_;
    // Maximum number of bytes that should be parsed; written by
    // set_size_limit().
251  int64 size_limit_;
252 
253 
254 };
255 
256 }
257 
258 #endif
bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const
Determines whether a tag should be terminated in HTML.
bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const
Determines whether it's OK to leave a tag unclosed.
QuoteStyle
Various ways things can be quoted (or not)
Definition: html_element.h:60
void FinishParse()
Completes parse, reporting any leftover text as a final HtmlCharacterEvent.
void DebugPrintStack()
Print element stack to stdout (for debugging).
const DocType & doctype() const
Definition: html_lexer.h:89
Definition: doctype.h:27
void StartParse(const StringPiece &id, const ContentType &content_type)
Initialize a new parse session, id is only used for error messages.
Definition: html_parse.h:88
Definition: html_element.h:42
void Parse(const char *text, int size)
void set_size_limit(int64 x)
Sets the limit on the maximum number of bytes that should be parsed.
Definition: html_lexer.h:92
bool size_limit_exceeded() const
Definition: html_lexer.h:96
std::string GoogleString
PAGESPEED_KERNEL_BASE_STRING_H_.
Definition: string.h:24
Style
Definition: html_element.h:50
bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const
Determines whether a tag can be terminated briefly (e.g. <tag/>)
Definition: content_type.h:31
Keyword
Definition: html_name.h:39
#define INSTAWEB_PRINTF_FORMAT(x, y)
< Not GCC
Definition: printf_format.h:34
static bool IsLiteralTag(HtmlName::Keyword keyword)
static bool IsSometimesLiteralTag(HtmlName::Keyword keyword)
Definition: html_lexer.h:45
HtmlElement * Parent() const