Page Speed Optimization Libraries  1.13.35.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
html_parse.h
Go to the documentation of this file.
1 /*
2  * Copyright 2010 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #ifndef PAGESPEED_KERNEL_HTML_HTML_PARSE_H_
20 #define PAGESPEED_KERNEL_HTML_HTML_PARSE_H_
21 
22 #include <cstdarg>
23 #include <cstddef>
24 #include <list>
25 #include <map>
26 #include <set>
27 #include <utility>
28 #include <vector>
29 
42 
43 namespace net_instaweb {
44 
45 class DocType;
46 class HtmlEvent;
47 class HtmlFilter;
48 class HtmlLexer;
49 class MessageHandler;
50 class Timer;
51 
52 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;
53 
88 class HtmlParse {
89  public:
90  explicit HtmlParse(MessageHandler* message_handler);
91  virtual ~HtmlParse();
92 
94 
97  void AddFilter(HtmlFilter* filter);
98 
104  bool StartParse(const StringPiece& url) {
105  return StartParseWithType(url, kContentTypeHtml);
106  }
107  bool StartParseWithType(const StringPiece& url,
108  const ContentType& content_type) {
109  return StartParseId(url, url, content_type);
110  }
111 
113  bool is_url_valid() const { return url_valid_; }
114 
119  virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
120  const ContentType& content_type);
121 
125  void SetUrlForTesting(const StringPiece& url);
126 
136  void ParseText(const char* content, int size) {
137  ParseTextInternal(content, size);
138  }
139  void ParseText(const StringPiece& sp) {
140  ParseTextInternal(sp.data(), sp.size());
141  }
142 
157  virtual void Flush();
158 
163  virtual void FinishParse();
164 
165 
167 
172  HtmlCdataNode* NewCdataNode(HtmlElement* parent,
173  const StringPiece& contents);
174  HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
175  const StringPiece& literal);
176  HtmlCommentNode* NewCommentNode(HtmlElement* parent,
177  const StringPiece& contents);
178  HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
179  const StringPiece& contents);
180  HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
181  const StringPiece& contents);
182  void InsertScriptAfterCurrent(StringPiece text, bool external);
183  void InsertScriptBeforeCurrent(StringPiece text, bool external);
184 
187  HtmlElement* AppendAnchor(StringPiece link, StringPiece text,
188  HtmlElement* parent);
189 
193 
196 
200  void InsertNodeBeforeNode(const HtmlNode* existing_node, HtmlNode* new_node);
201  void InsertNodeAfterNode(const HtmlNode* existing_node, HtmlNode* new_node);
202 
205  void InsertElementBeforeElement(const HtmlNode* existing_element,
206  HtmlNode* new_element) {
207  InsertNodeBeforeNode(existing_element, new_element);
208  }
209 
210  void InsertElementAfterElement(const HtmlNode* existing_element,
211  HtmlNode* new_element) {
212  InsertNodeAfterNode(existing_element, new_element);
213  }
214 
218  void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
219  void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);
220 
223  void InsertNodeBeforeCurrent(HtmlNode* new_node);
224 
229  void InsertNodeAfterCurrent(HtmlNode* new_node);
230 
236  bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
237  HtmlElement* new_parent);
238 
247  bool MoveCurrentInto(HtmlElement* new_parent);
248 
254  bool MoveCurrentBefore(HtmlNode* existing_node);
255 
259  bool DeleteNode(HtmlNode* node);
260 
267  bool DeleteSavingChildren(HtmlElement* element);
268 
276  bool MakeElementInvisible(HtmlElement* element);
277 
290  bool HasChildrenInFlushWindow(HtmlElement* element);
291 
294  bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);
295 
298  HtmlElement* CloneElement(HtmlElement* in_element);
299 
300  HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) {
301  return NewElement(parent, MakeName(str));
302  }
303  HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) {
304  return NewElement(parent, MakeName(keyword));
305  }
306  HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name);
307 
313  void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
314  const StringPiece& value) {
315  return element->AddAttribute(MakeName(keyword), value,
316  HtmlElement::DOUBLE_QUOTE);
317  }
318  void AddAttribute(HtmlElement* element, StringPiece name,
319  const StringPiece& value) {
320  return element->AddAttribute(MakeName(name), value,
321  HtmlElement::DOUBLE_QUOTE);
322  }
323  void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword,
324  const StringPiece& escaped_value) {
325  return element->AddEscapedAttribute(MakeName(keyword), escaped_value,
326  HtmlElement::DOUBLE_QUOTE);
327  }
328  void SetAttributeName(HtmlElement::Attribute* attribute,
329  HtmlName::Keyword keyword) {
330  attribute->set_name(MakeName(keyword));
331  }
332 
333  HtmlName MakeName(const StringPiece& str);
334  HtmlName MakeName(HtmlName::Keyword keyword);
335 
336  bool IsRewritable(const HtmlNode* node) const;
341  bool CanAppendChild(const HtmlNode* node) const;
342 
343  void ClearElements();
344 
346  void DebugLogQueue();
347 
349  void DebugPrintQueue();
350 
352  friend class HtmlLexer;
353 
356  bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
357 
361  static bool IsLiteralTag(HtmlName::Keyword keyword);
362 
370  static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
371 
375  bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
376 
378  bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
379 
380  MessageHandler* message_handler() const { return message_handler_; }
383  const char* url() const { return url_.c_str(); }
385  const GoogleUrl& google_url() const { return google_url_; }
386  const char* id() const { return id_.c_str(); }
387  int line_number() const { return line_number_; }
389  GoogleString UrlLine() const {
390  return StringPrintf("%s:%d", id(), line_number());
391  }
392 
395  const DocType& doctype() const;
396 
398  void Info(const char* filename, int line, const char* msg, ...)
399  INSTAWEB_PRINTF_FORMAT(4, 5);
400  void Warning(const char* filename, int line, const char* msg, ...)
401  INSTAWEB_PRINTF_FORMAT(4, 5);
402  void Error(const char* filename, int line, const char* msg, ...)
403  INSTAWEB_PRINTF_FORMAT(4, 5);
404  void FatalError(const char* filename, int line, const char* msg, ...)
405  INSTAWEB_PRINTF_FORMAT(4, 5);
406 
407  void InfoV(const char* file, int line, const char *msg, va_list args);
408  void WarningV(const char* file, int line, const char *msg, va_list args);
409  void ErrorV(const char* file, int line, const char *msg, va_list args);
410  void FatalErrorV(const char* file, int line, const char* msg, va_list args);
411 
413  void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
414  void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
415  void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
416  void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
417 
420  void ShowProgress(const char* message);
421 
422  void InfoHereV(const char *msg, va_list args) {
423  InfoV(id_.c_str(), line_number_, msg, args);
424  }
425  void WarningHereV(const char *msg, va_list args) {
426  WarningV(id_.c_str(), line_number_, msg, args);
427  }
428  void ErrorHereV(const char *msg, va_list args) {
429  ErrorV(id_.c_str(), line_number_, msg, args);
430  }
431  void FatalErrorHereV(const char* msg, va_list args) {
432  FatalErrorV(id_.c_str(), line_number_, msg, args);
433  }
434 
435  void AddElement(HtmlElement* element, int line_number);
436  void CloseElement(HtmlElement* element, HtmlElement::Style style,
437  int line_number);
438 
440  void ApplyFilter(HtmlFilter* filter);
441 
444  void set_timer(Timer* timer) { timer_ = timer; }
445  Timer* timer() const { return timer_; }
446  void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; }
447 
450  void add_event_listener(HtmlFilter* listener);
451 
461  bool InsertComment(StringPiece sp);
462 
464  void set_size_limit(int64 x);
466  bool size_limit_exceeded() const;
467 
472  void SetDynamicallyDisabledFilterList(StringVector* list) {
473  dynamically_disabled_filter_list_ = list;
474  }
475 
501  void DeferCurrentNode();
502 
508  void RestoreDeferredNode(HtmlNode* deferred_node);
509 
511  bool can_modify_urls() {
512  return can_modify_urls_;
513  }
514 
515  protected:
516  typedef std::vector<HtmlFilter*> FilterVector;
517  typedef std::list<HtmlFilter*> FilterList;
518  typedef std::pair<HtmlNode*, HtmlEventList*> DeferredNode;
519  typedef std::map<const HtmlNode*, HtmlEventList*> NodeToEventListMap;
520  typedef std::map<HtmlFilter*, DeferredNode> FilterElementMap;
521  typedef std::set<const HtmlNode*> NodeSet;
522 
526  void BeginFinishParse();
527  void EndFinishParse();
528 
531  void Clear();
532 
534  size_t GetEventQueueSize();
535 
536  virtual void ParseTextInternal(const char* content, int size);
537 
539  void DetermineFiltersBehavior() {
540  if (!determine_filter_behavior_called_) {
541  determine_filter_behavior_called_ = true;
542  can_modify_urls_ = false;
543  DetermineFiltersBehaviorImpl();
544  }
545  }
546 
547  void DetermineFilterListBehavior(const FilterList& list) {
548  for (FilterList::const_iterator i = list.begin(); i != list.end(); ++i) {
549  CheckFilterBehavior(*i);
550  }
551  }
552 
553  void CheckFilterBehavior(HtmlFilter* filter);
554 
563  virtual void DetermineFiltersBehaviorImpl();
564 
584  void set_buffer_events(bool x) { buffer_events_ = x; }
585 
589 
591  void DisableFiltersInjectingScripts(const FilterList& filters);
592 
593  private:
594  void ApplyFilterHelper(HtmlFilter* filter);
595  HtmlEventListIterator Last();
596  bool IsInEventWindow(const HtmlEventListIterator& iter) const;
597  void InsertNodeBeforeEvent(const HtmlEventListIterator& event,
598  HtmlNode* new_node);
599  void InsertNodeAfterEvent(const HtmlEventListIterator& event,
600  HtmlNode* new_node);
601  bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to);
602  bool IsDescendantOf(const HtmlNode* possible_child,
603  const HtmlNode* possible_parent);
604  void SanityCheck();
605  void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
606  HtmlElement* actual);
607  void CheckParentFromAddEvent(HtmlEvent* event);
608  void FixParents(const HtmlEventListIterator& begin,
609  const HtmlEventListIterator& end_inclusive,
610  HtmlElement* new_parent);
611  void CoalesceAdjacentCharactersNodes();
612  void ClearEvents();
613  void EmitQueue(MessageHandler* handler);
614  inline void NextEvent();
615  void ClearDeferredNodes();
616  inline bool IsRewritableIgnoringDeferral(const HtmlNode* node) const;
617  inline bool IsRewritableIgnoringEnd(const HtmlNode* node) const;
618  void SetupScript(StringPiece text, bool external, HtmlElement* script);
619 
621  friend class HtmlTestingPeer;
622  void AddEvent(HtmlEvent* event);
623  void SetCurrent(HtmlNode* node);
624  void set_coalesce_characters(bool x) { coalesce_characters_ = x; }
625  size_t symbol_table_size() const {
626  return string_table_.string_bytes_allocated();
627  }
628 
634  void DelayLiteralTag();
635 
636  FilterVector event_listeners_;
637  SymbolTableSensitive string_table_;
638  FilterList filters_;
639  HtmlLexer* lexer_;
640  Arena<HtmlNode> nodes_;
641  HtmlEventList queue_;
642  HtmlEventListIterator current_;
644  MessageHandler* message_handler_;
645  GoogleString url_;
646  GoogleUrl google_url_;
647  GoogleString id_;
648  int line_number_;
649  bool skip_increment_;
650  bool determine_filter_behavior_called_;
651  bool can_modify_urls_;
652  bool determine_enabled_filters_called_;
653  bool need_sanity_check_;
654  bool coalesce_characters_;
655  bool need_coalesce_characters_;
656  bool url_valid_;
657  bool log_rewrite_timing_;
658  bool running_filters_;
659  bool buffer_events_;
660  int64 parse_start_time_us_;
661  scoped_ptr<HtmlEvent> delayed_start_literal_;
662  Timer* timer_;
663  HtmlFilter* current_filter_;
664 
670  FilterElementMap open_deferred_nodes_;
671 
673  NodeToEventListMap deferred_nodes_;
674 
679  NodeSet deferred_deleted_nodes_;
680 
681  StringVector* dynamically_disabled_filter_list_;
682 
683 
684 };
685 
686 }
687 
688 #endif
void ApplyFilter(HtmlFilter *filter)
Run a filter on the current queue of parse nodes.
class GoogleUrl
Definition: google_url.h:58
GoogleString UrlLine() const
Returns URL (or id) and line number as a string, to be used in messages.
Definition: html_parse.h:389
Definition: html_filter.h:35
void AddFilter(HtmlFilter *filter)
Application methods for parsing functions and adding filters.
const ContentType & kContentTypeHtml
HTML-like (i.e. rewritable) text:
void set_size_limit(int64 x)
Sets the limit on the maximum number of bytes that should be parsed.
void InsertElementBeforeElement(const HtmlNode *existing_element, HtmlNode *new_element)
Definition: html_parse.h:205
bool DeleteNode(HtmlNode *node)
Definition: html_event.h:31
void add_event_listener(HtmlFilter *listener)
void set_timer(Timer *timer)
Definition: html_parse.h:444
const GoogleUrl & google_url() const
Gets a parsed GoogleUrl& corresponding to url().
Definition: html_parse.h:385
friend class HtmlLexer
Implementation helper with detailed knowledge of html parsing libraries.
Definition: html_parse.h:352
bool StartParse(const StringPiece &url)
Definition: html_parse.h:104
Definition: doctype.h:27
static bool IsLiteralTag(HtmlName::Keyword keyword)
void PrependChild(const HtmlElement *existing_parent, HtmlNode *new_child)
Definition: html_parse.h:88
void InfoHere(const char *msg,...) INSTAWEB_PRINTF_FORMAT(2, 3)
Report error message with current parsing filename and linenumber.
bool can_modify_urls()
Returns whether the filter pipeline can rewrite urls.
Definition: html_parse.h:511
Definition: html_element.h:42
void ShowProgress(const char *message)
bool AddParentToSequence(HtmlNode *first, HtmlNode *last, HtmlElement *new_parent)
void set_buffer_events(bool x)
Definition: html_parse.h:584
void RestoreDeferredNode(HtmlNode *deferred_node)
bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const
void DebugPrintQueue()
Print the HtmlEvent queue_ to stdout for debugging.
void InsertNodeAfterCurrent(HtmlNode *new_node)
bool MoveCurrentBefore(HtmlNode *existing_node)
const char * url() const
Definition: html_parse.h:383
bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const
Determines whether a tag allows brief termination in HTML, e.g. <tag>
void InsertNodeBeforeNode(const HtmlNode *existing_node, HtmlNode *new_node)
void DetermineFiltersBehavior()
Calls DetermineFiltersBehaviorImpl in an idempotent way.
Definition: html_parse.h:539
bool MakeElementInvisible(HtmlElement *element)
const DocType & doctype() const
std::string GoogleString
PAGESPEED_KERNEL_BASE_STRING_H_.
Definition: string.h:24
Style
Definition: html_element.h:50
void Info(const char *filename, int line, const char *msg,...) INSTAWEB_PRINTF_FORMAT(4, 5)
Interface for any caller to report an error message via the message handler.
bool ReplaceNode(HtmlNode *existing_node, HtmlNode *new_node)
size_t string_bytes_allocated() const
Definition: symbol_table.h:67
bool is_url_valid() const
Returns whether the google_url() URL is valid.
Definition: html_parse.h:113
HtmlElement * CloneElement(HtmlElement *in_element)
void SetDynamicallyDisabledFilterList(StringVector *list)
Definition: html_parse.h:472
void ParseText(const char *content, int size)
Definition: html_parse.h:136
Definition: html_node.h:43
virtual void Flush()
virtual bool StartParseId(const StringPiece &url, const StringPiece &id, const ContentType &content_type)
bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const
bool HasChildrenInFlushWindow(HtmlElement *element)
bool MoveCurrentInto(HtmlElement *new_parent)
void DisableFiltersInjectingScripts()
Definition: html_testing_peer.h:33
Definition: content_type.h:31
Keyword
Definition: html_name.h:39
bool DeleteSavingChildren(HtmlElement *element)
bool CanAppendChild(const HtmlNode *node) const
virtual void DetermineFiltersBehaviorImpl()
void AddAttribute(HtmlElement *element, HtmlName::Keyword keyword, const StringPiece &value)
Definition: html_parse.h:313
virtual void FinishParse()
size_t GetEventQueueSize()
Returns the number of events on the event queue.
#define INSTAWEB_PRINTF_FORMAT(x, y)
Not GCC
Definition: printf_format.h:34
Definition: message_handler.h:39
void SetUrlForTesting(const StringPiece &url)
void AddAttribute(const Attribute &attr)
bool InsertComment(StringPiece sp)
bool size_limit_exceeded() const
Returns whether we have exceeded the size limit.
Definition: html_lexer.h:45
Timer interface, made virtual so it can be mocked for tests.
Definition: timer.h:27
HtmlElement * AppendAnchor(StringPiece link, StringPiece text, HtmlElement *parent)
void DebugLogQueue()
Log the HtmlEvent queue_ to the message_handler_ for debugging.
HtmlCdataNode * NewCdataNode(HtmlElement *parent, const StringPiece &contents)
Utility methods for implementing filters.
static bool IsSometimesLiteralTag(HtmlName::Keyword keyword)
void InsertNodeBeforeCurrent(HtmlNode *new_node)