From 1dc48143f3385f80fa69f3115380055a53b540f9 Mon Sep 17 00:00:00 2001
From: Nimetu
Date: Mon, 13 Apr 2015 20:52:34 +0300
Subject: [PATCH] Implement html parser using libxml2

--HG--
branch : libxml2-html-parser
---
 code/nel/include/nel/gui/group_html.h  |   9 ++
 code/nel/src/gui/group_html_parser.cpp | 197 +++++++++++++++++++++++++
 2 files changed, 206 insertions(+)
 create mode 100644 code/nel/src/gui/group_html_parser.cpp

diff --git a/code/nel/include/nel/gui/group_html.h b/code/nel/include/nel/gui/group_html.h
index 7b974257a..1f3c8e4dc 100644
--- a/code/nel/include/nel/gui/group_html.h
+++ b/code/nel/include/nel/gui/group_html.h
@@ -102,6 +102,9 @@ namespace NLGUI
 		// Browse
 		virtual void browse (const char *url);
 
+		// parse html string using libxml2 parser
+		virtual bool parseHtml(std::string htmlString);
+
 		// Refresh
 		void refresh();
 
@@ -199,6 +202,7 @@ namespace NLGUI
 		int luaBeginElement(CLuaState &ls);
 		int luaEndElement(CLuaState &ls);
 		int luaShowDiv(CLuaState &ls);
+		int luaParseHtml(CLuaState &ls);
 
 		REFLECT_EXPORT_START(CGroupHTML, CGroupScrollText)
 			REFLECT_LUA_METHOD("browse", luaBrowse)
@@ -210,6 +214,7 @@ namespace NLGUI
 			REFLECT_LUA_METHOD("beginElement", luaBeginElement)
 			REFLECT_LUA_METHOD("endElement", luaEndElement)
 			REFLECT_LUA_METHOD("showDiv", luaShowDiv)
+			REFLECT_LUA_METHOD("parseHtml", luaParseHtml)
 			REFLECT_STRING("url", getURL, setURL)
 			REFLECT_FLOAT("timeout", getTimeout, setTimeout)
 		REFLECT_EXPORT_END
@@ -251,6 +256,10 @@ namespace NLGUI
 		// the current request is terminated
 		virtual void requestTerminated(HTRequest *request);
 
+		// libxml2 html parser functions
+		void htmlElement(xmlNode *node, int element_number);
+		void htmlWalkDOM(xmlNode *a_node);
+
 		// Get Home URL
 		virtual std::string home();
 
diff --git a/code/nel/src/gui/group_html_parser.cpp b/code/nel/src/gui/group_html_parser.cpp
new file mode 100644
index 000000000..fdb9a549f
--- /dev/null
+++ b/code/nel/src/gui/group_html_parser.cpp
@@ -0,0 +1,197 @@
+// Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
+// Copyright (C) 2010 Winch Gate Property Limited
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "stdpch.h"
+
+#include <string>
+#include <libxml/HTMLparser.h>
+
+#include "nel/misc/types_nl.h"
+#include "nel/gui/libwww.h"
+#include "nel/gui/group_html.h"
+#include "nel/gui/lua_ihm.h"
+
+using namespace std;
+using namespace NLMISC;
+
+namespace NLGUI
+{
+	// ***************************************************************************
+	// Emit one DOM element through the libwww-style callbacks: open the tag,
+	// walk its children, then close the tag.
+	// node           - element node being processed
+	// element_number - index of the tag in the libwww HTML DTD; a value of
+	//                  HTML_ELEMENTS or above is handled as an unparsed tag
+	void CGroupHTML::htmlElement(xmlNode *node, int element_number)
+	{
+		SGML_dtd *HTML_DTD = HTML_dtd ();
+
+		if (element_number < HTML_ELEMENTS)
+		{
+			CXMLAutoPtr ptr;
+			// load attributes into libwww structs
+			BOOL present[MAX_ATTRIBUTES];
+			const char *value[MAX_ATTRIBUTES];
+			std::string strvalues[MAX_ATTRIBUTES];
+
+			uint nbAttributes = std::min(MAX_ATTRIBUTES, HTML_DTD->tags[element_number].number_of_attributes);
+			for(uint i=0; i<nbAttributes; i++)
+			{
+				std::string name;
+				name = toLower(std::string(HTML_DTD->tags[element_number].attributes[i].name));
+				ptr = xmlGetProp(node, (const xmlChar *)name.c_str());
+				if (ptr)
+				{
+					// copy xmlChar to string (xmlChar will be released)
+					strvalues[i] = (const char *)(ptr);
+					// now use string pointer in value[] array
+					value[i] = strvalues[i].c_str();
+					present[i] = true;
+				}
+				else
+				{
+					value[i] = NULL;
+					present[i] = false;
+				}
+			}
+
+			if (element_number == HTML_A)
+			{
+				addLink(element_number, present, value);
+			}
+
+			beginElement(element_number, present, value);
+		}
+		else
+		{
+			beginUnparsedElement((const char *)(node->name), xmlStrlen(node->name));
+		}
+
+		// recursive - text content / child nodes
+		htmlWalkDOM(node->children);
+
+		// closing tag
+		if (element_number < HTML_ELEMENTS)
+		{
+			endElement(element_number);
+		}
+		else
+		{
+			endUnparsedElement((const char *)(node->name), xmlStrlen(node->name));
+		}
+	}
+
+	// ***************************************************************************
+	// recursive function to walk html document
+	// Visits a_node and all of its following siblings; text nodes go to addText(),
+	// element nodes go to htmlElement(), every other node type is skipped.
+	void CGroupHTML::htmlWalkDOM(xmlNode *a_node)
+	{
+		SGML_dtd *HTML_DTD = HTML_dtd ();
+
+		uint element_number;
+		xmlNode *node = a_node;
+		while(node)
+		{
+			if (node->type == XML_TEXT_NODE)
+			{
+				addText((const char *)(node->content), xmlStrlen(node->content));
+			}
+			else
+			if (node->type == XML_ELEMENT_NODE)
+			{
+				// find libwww tag
+				// compare whole names: a length-limited compare would let a short
+				// unknown tag match the first known tag it is a prefix of
+				for(element_number = 0; element_number<HTML_ELEMENTS; ++element_number)
+				{
+					if (xmlStrcasecmp(node->name, (const xmlChar *)HTML_DTD->tags[element_number].name) == 0)
+						break;
+				}
+
+				htmlElement(node, element_number);
+			}
+
+			// move into next sibling
+			node = node->next;
+		}
+	}
+
+	// ***************************************************************************
+	// Parse htmlString with the libxml2 push parser and feed the resulting DOM
+	// through htmlWalkDOM().
+	// Returns false when the parser context cannot be created or when parsing
+	// yields no document / no root element, true otherwise.
+	bool CGroupHTML::parseHtml(std::string htmlString)
+	{
+		htmlParserCtxtPtr parser = htmlCreatePushParserCtxt(NULL, NULL, NULL, 0, NULL, XML_CHAR_ENCODING_NONE);
+		if (!parser)
+		{
+			nlwarning("Creating html parser context failed");
+			return false;
+		}
+
+		htmlCtxtUseOptions(parser, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
+
+		htmlParseChunk(parser, htmlString.c_str(), (int)htmlString.size(), 0);
+		htmlParseChunk(parser, "", 0, 1);
+
+		bool success = true;
+		if (parser->myDoc)
+		{
+			xmlNode *root = xmlDocGetRootElement(parser->myDoc);
+			if (root)
+			{
+				htmlWalkDOM(root);
+			}
+			else
+			{
+				nlwarning("html root node failed");
+				success = false;
+			}
+
+			// htmlFreeParserCtxt() does not release the parsed document
+			xmlFreeDoc(parser->myDoc);
+			parser->myDoc = NULL;
+		}
+		else
+		{
+			nlwarning("htmlstring parsing failed");
+			success = false;
+		}
+
+		htmlFreeParserCtxt(parser);
+		return success;
+	}
+
+	// ***************************************************************************
+	// Lua binding for parseHtml(): checks for exactly one argument of type
+	// LUA_TSTRING. Always returns 0; the result of parseHtml() is ignored.
+	int CGroupHTML::luaParseHtml(CLuaState &ls)
+	{
+		const char *funcName = "parseHtml";
+		CLuaIHM::checkArgCount(ls, funcName, 1);
+		CLuaIHM::checkArgType(ls, funcName, 1, LUA_TSTRING);
+		std::string html = ls.toString(1);
+
+		parseHtml(html);
+
+		return 0;
+	}
+
+}
+