cpp 解析HTML之 htmlcxx

益英逸

2023-12-01

html与xml在格式上比较相似，但xml解析器并不一定能支持html的解析（html允许标签不闭合、属性不加引号等不严格的写法）。这里介绍一个c++解析html的开源项目： htmlcxx

一、代码示例

1、项目源码下载之后，使用vs打开即可，默认为生成.lib静态库及MTd模式，可以在属性中修改指定为MD/MDd（MTd在实际项目中运用相对比较少，具体得看宿主项目的编译模式为MT还是MD）。
2、HTML::ParserDom 对象可以将 html文本字符串解析为Dom格式，通过遍历dom 树形结构的指定节点，可以获取我们需要的数据。如下示例，非常简单。

 #include <htmlcxx/html/ParserDom.h>
  
  // NOTE(review): this excerpt uses `string`, `cout`, `endl` and `HTML::`
  // unqualified, so it presumably relies on <iostream>/<string> plus
  // `using namespace std;` and `using namespace htmlcxx;` being in effect
  // in the surrounding file - confirm before pasting it into a project.
  
  // Parse some HTML text into a DOM tree.
  string html = "<html><body>hey</body></html>";
  HTML::ParserDom parser;
  tree<HTML::Node> dom = parser.parseTree(html);
  
  // Print the whole DOM tree.
  cout << dom << endl;
  
  // Dump the target of every link in the tree.
  tree<HTML::Node>::iterator it = dom.begin();
  tree<HTML::Node>::iterator end = dom.end();
  for (; it != end; ++it)
  {
  	// Accept both spellings of the anchor tag: real-world HTML almost
  	// always writes it in lower case, which a comparison against "A"
  	// alone would miss.
  	const string &tag = it->tagName();
  	if (tag == "a" || tag == "A")
  	{
  		// Attributes are parsed lazily, so this call must come first.
  		it->parseAttributes();
  		// attribute(name) returns a (found, value) pair; attributes()
  		// takes no argument and returns the whole attribute map.
  		cout << it->attribute("href").second << endl;
  	}
  }
  
  // Dump all text of the document: every node that is neither a tag nor
  // a comment.
  it = dom.begin();
  end = dom.end();
  for (; it != end; ++it)
  {
  	if ((!it->isTag()) && (!it->isComment()))
  	{
  		cout << it->text();
  	}
  }

二、源码分析和学习

1、实现一个ci_string类型

//ci_string.h

#ifndef __CI_STRING__
#define __CI_STRING__

#include <cctype>
#include <string>

// Character traits that make std::basic_string compare and search without
// regard to letter case (as decided by tolower() in the current C locale).
//
// Only the comparison-related members are overridden; everything else
// (copy, move, assign, length, ...) is inherited from std::char_traits<char>.
struct ci_char_traits : public std::char_traits<char>
{
	// True when c1 and c2 are the same character ignoring case.
	static bool eq( char c1, char c2 ) {
		return fold(c1) == fold(c2);
	}

	// True when c1 and c2 differ even after ignoring case.
	static bool ne( char c1, char c2 ) {
		return fold(c1) != fold(c2);
	}

	// True when c1 orders before c2 ignoring case.
	static bool lt( char c1, char c2 ) {
		return fold(c1) < fold(c2);
	}

	// Compares the first n characters of s1 and s2 ignoring case.
	// Returns a negative value, zero or a positive value when s1 orders
	// before, equal to or after s2.
	//
	// Implemented by hand so that all n characters are examined (a
	// basic_string may contain embedded '\0' characters) and so that no
	// platform-specific function such as _strnicmp/strncasecmp is needed.
	static int compare( const char* s1,
			const char* s2,
			size_t n ) {
		for ( size_t i = 0; i < n; ++i ) {
			const int a = fold(s1[i]);
			const int b = fold(s2[i]);
			if ( a != b ) {
				return a < b ? -1 : 1;
			}
		}
		return 0;
	}

	// Returns a pointer to the first of the n characters starting at s that
	// equals a ignoring case, or a null pointer when there is none.
	// basic_string::find relies on the null result to report npos.
	static const char*
		find( const char* s, size_t n, char a ) {
			const int wanted = fold(a);
			for ( ; n > 0; --n, ++s ) {
				if ( fold(*s) == wanted ) {
					return s;
				}
			}
			return 0;
		}

private:
	// Lower-cases one character. The cast to unsigned char is required:
	// passing a negative char value to tolower() is undefined behaviour.
	static int fold( char c ) {
		return tolower( static_cast<unsigned char>(c) );
	}
};

// std::string look-alike whose comparisons and searches ignore case.
typedef std::basic_string<char, ci_char_traits> ci_string;

#endif

这里思考一下，我们知道

using string  = basic_string<char, char_traits<char>, allocator<char>>;

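这里作者的本意是让字符串的比较和查找忽略大小写。basic_string 的比较、查找逻辑并不是写死在字符串类里，而是全部委托给第二个模板参数 Traits，所以只需要从 std::char_traits<char> 派生，重写 eq/lt/compare/find 这几个静态成员函数，再用 typedef 得到 ci_string 即可；而继承 std::string 并不能改变它内部所使用的 char_traits，因此达不到目的。
另外也要注意，basic_string 的析构函数不带 virtual 关键字（std::string 只是它的一个别名，自然也一样）。假设一个子类继承于 std::string，并且通过基类指针 delete 子类对象，那么子类的析构函数不会被调用，属于未定义行为，所以一般也不建议把 std::string 当作基类来继承。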

2、debug.h

//debug.h

#ifdef DEBUGP
#undef DEBUGP
#endif

#ifdef __cplusplus
#include <cstdio>
#if defined(WIN32) && !defined(__MINGW32__)
#include <cstring>
#endif
#else
#include <stdio.h>
#if defined(WIN32) && !defined(__MINGW32__)
#include <string.h>
#endif
#endif

#if defined(WIN32) && !defined(__MINGW32__)
#ifndef __DEBUG_H__
#define __DEBUG_H__
#include <stdarg.h>
/*
 * printf-style diagnostic output to stderr; this is what DEBUGP expands to
 * on WIN32 (non-MinGW) builds when DEBUG is defined.
 *
 * format  printf-style format string
 * ...     the arguments consumed by format
 *
 * The message is printed exactly as formatted. A "%s:%d " file/line prefix
 * cannot be added here: the function never receives __FILE__/__LINE__, so
 * prepending those conversions would make vfprintf read the caller's own
 * arguments as the file name and line number (undefined behaviour).
 */
inline void debugprintf(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	vfprintf(stderr, format, ap);
	va_end(ap);	/* every va_start is paired with va_end on all paths */
}
/*
 * No-op counterpart of debugprintf: accepts a printf-style format and any
 * arguments, then discards them. DEBUGP expands to this on WIN32
 * (non-MinGW) builds when DEBUG is not defined.
 */
inline void dummyprintf(const char *format, ...)
{
	(void)format;	/* intentionally unused */
}
#endif
#endif

/*
 * DEBUGP(format, ...): printf-style diagnostic macro.
 *
 *   DEBUG defined,  WIN32 (non-MinGW): expands to debugprintf
 *   DEBUG defined,  elsewhere:         prints "file:line " followed by the
 *                                      formatted message to stderr
 *   DEBUG undefined, WIN32 (non-MinGW): expands to dummyprintf
 *   DEBUG undefined, elsewhere:        expands to nothing, so the
 *                                      arguments are not evaluated
 *
 * The variadic forms use the standard __VA_ARGS__ spelling (C99 / C++11)
 * instead of the GNU-only named form `args...`.
 */
#ifdef DEBUG
#if defined(WIN32) && !defined(__MINGW32__)
#define DEBUGP debugprintf
#else
#define DEBUGP(...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, __VA_ARGS__); } while(0)
#endif
#else
#if defined(WIN32) && !defined(__MINGW32__)
#define DEBUGP dummyprintf
#else
#define DEBUGP(...)
#endif
#endif

这里有两个需要注意和学习的地方。
1、几个常用的宏

__cplusplus  	判断当前为C++的宏
DEBUG			判断当前为Debug模式的宏（并非编译器预定义的宏，需要由项目自行定义；MSVC 在 Debug 配置下预定义的是 _DEBUG）
__FILE__		文件名
__LINE__		当前行
__FUNCTION__	函数名

2、可变参函数 …

    // Excerpt of the debugprintf body, highlighting the three steps every
    // variadic function needs: va_list, va_start, va_end.
    va_list ap; // (1) cursor over the unnamed arguments
	char *f = NULL;
	const char *p="%s:%d "; // prefix prepended to the caller's format
	size_t plen = strlen(p);
    va_start(ap, format); // (2) start reading after the last named parameter
	f = (char *)malloc(plen + strlen(format) + 1);
	// NOTE(review): this early return leaves without calling va_end(ap).
	if (!f) return;
	// Build f = p + format; the second copy includes the terminating '\0'.
	memcpy(f, p, plen);
	memcpy(f + plen, format, strlen(format) + 1);
	// NOTE(review): f now starts with "%s:%d ", but ap only holds the
	// caller's arguments - nothing supplies a file name or line number for
	// those two conversions. Confirm whether this is intended.
    vfprintf(stderr, f, ap);
    va_end(ap); // (3) finish with the argument list

后面的内容接下篇吧。忙其他的去了~ 小羊人的自律

cpp 解析HTML之 htmlcxx

一、代码示例

二、源码分析和学习

相关阅读

相关文章

相关问答

相关文档