上篇文章,写到编译gumbo成功,接下来测试一下gumbo提供的API如何运行
https://github.com/google/gumbo-parser#gumbo---a-pure-c-html5-parser 中有一个简单的示例程序
实例一
#include <stdio.h>
#include <stdlib.h>

#include "gumbo.h"

// Minimal gumbo example: parses the HTML text given as the first command-line
// argument and then releases the parse tree.
//
// Returns EXIT_FAILURE (after printing a usage line) when no argument is
// supplied, EXIT_SUCCESS otherwise.
int main(int argc, char** argv) {
  // Without this check argv[1] is NULL when the program is started with no
  // arguments, and handing NULL to gumbo_parse crashes the process.
  if (argc < 2) {
    fprintf(stderr, "Usage: mygb <html text>\n");
    return EXIT_FAILURE;
  }
  GumboOutput* output = gumbo_parse(argv[1]);
  // Do stuff with output->root
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return EXIT_SUCCESS;
}
将文件命名为mygb.c,保存在/usr/test目录下
将头文件gumbo.h保存在/usr/include/gumbo目录下
将此前编译好的静态库文件libgumbo.a保存在同级目录中,如该目录 /usr/test/libgumbo.a
编译gcc -I /usr/include/gumbo mygb.c -o mygb.exe /usr/test/libgumbo.a -lpthread
huareal@gpx /usr/test
# gcc -I /usr/include/gumbo mygb.c -o mygb.exe /usr/test/libgumbo.a -lpthread
然后执行
huareal@gpx /usr/test
# ./mygb.exe
Segmentation fault (core dumped)
执行有错误:运行时没有传入任何参数,argv[1]为NULL,程序把空指针直接交给gumbo_parse解析,因此出现段错误。
测试二
尝试编译gumbo自带的实例程序
gumbo\examples下面有几个实例
clean_text.cc
find_links.cc
get_title.c
positions_of_class.cc
首先分析get_title.c
尝试编译
huareal@gpx /usr/test
# gcc -I /usr/include/gumbo get_title.c -o gettitle.exe /usr/test/libgumbo.a -lpthread
编译成功
写一个one.html
<html>
<head>
<title>Hello,gumbo</title>
</head>
<body>
<h1>Test Gumbo</h1>
</body>
</html>
保存在/usr/test当前目录下
然后执行
huareal@gpx /usr/test
# ./gettitle.exe one.html
Hello,gumbo
执行成功
分析代码
a:主函数
// Entry point of get_title: parses the HTML file named on the command line
// and prints the text of its <title> element on stdout.
//
// argv[1] is the path of the HTML file. The process exits with EXIT_FAILURE
// when the argument count is wrong or the file cannot be opened.
int main(int argc, const char** argv) {
  if (argc != 2) {
    printf("Usage: get_title <html filename>.\n");
    exit(EXIT_FAILURE);
  }
  const char* filename = argv[1];
  FILE* fp = fopen(filename, "r");  // Open the input file.
  if (!fp) {
    printf("File %s not found!\n", filename);
    exit(EXIT_FAILURE);
  }

  char* input = NULL;
  int input_length = 0;
  read_file(fp, &input, &input_length);  // Load the file contents.
  // The stream is not needed once its contents are in memory; close it so the
  // handle is not leaked.
  fclose(fp);

  GumboOutput* output = gumbo_parse_with_options(
      &kGumboDefaultOptions, input, input_length);  // Parse the HTML.
  // The returned string may belong to the parse tree, so it is printed before
  // the tree is destroyed.
  const char* title = find_title(output->root);  // Look up the title text.
  printf("%s\n", title);

  gumbo_destroy_output(&kGumboDefaultOptions, output);
  free(input);
  return EXIT_SUCCESS;
}
// Reads everything remaining on a stream into a freshly allocated buffer.
//
// fp:     open stream, positioned where reading should start.
// output: receives the buffer; the caller owns it and must free() it. The
//         buffer is always NUL-terminated.
// length: receives the number of bytes actually read (terminator excluded).
//
// The buffer grows as data arrives instead of trusting a size reported by the
// file system, so the reported length is correct even when the stream yields
// fewer bytes than the file size (e.g. text-mode newline translation) or has
// no meaningful size at all.
//
// Prints a message on stderr and exits with EXIT_FAILURE when memory runs out,
// when the stream reports a read error, or when the content does not fit in
// an int.
static void read_file(FILE* fp, char** output, int* length) {
  // Largest count accepted for *length (INT_MAX, spelled without <limits.h>).
  const size_t max_length = (size_t)(~0u >> 1);
  size_t capacity = 4096;
  size_t used = 0;
  char* buffer = malloc(capacity);
  if (buffer == NULL) {
    fprintf(stderr, "read_file: out of memory\n");
    exit(EXIT_FAILURE);
  }

  for (;;) {
    // One byte is always held back for the terminating NUL.
    size_t got = fread(buffer + used, 1, capacity - 1 - used, fp);
    if (got == 0) {
      break;
    }
    used += got;
    if (used >= max_length) {
      free(buffer);
      fprintf(stderr, "read_file: input too large\n");
      exit(EXIT_FAILURE);
    }
    if (used == capacity - 1) {
      size_t new_capacity = capacity * 2;
      // Keep the old pointer until realloc succeeds so it can still be freed.
      char* grown = realloc(buffer, new_capacity);
      if (grown == NULL) {
        free(buffer);
        fprintf(stderr, "read_file: out of memory\n");
        exit(EXIT_FAILURE);
      }
      buffer = grown;
      capacity = new_capacity;
    }
  }

  // fread returns 0 both at end of file and on error; tell them apart.
  if (ferror(fp)) {
    free(buffer);
    fprintf(stderr, "read_file: read error\n");
    exit(EXIT_FAILURE);
  }

  buffer[used] = '\0';
  *output = buffer;
  *length = (int)used;
}
可以进一步分析
gumbo_parse_with_options
// Finds the document title by walking the parse tree: first the <head>
// element among the root's children, then the first <title> element inside
// that <head>.
//
// root: root node of the parsed document; asserted to be an element with at
//       least two children, one of which must be <head>.
//
// Returns the text of the <title> element, "<empty title>" when that element
// does not have exactly one child, or "<no title found>" when <head> contains
// no <title> element.
static const char* find_title(const GumboNode* root) {
  assert(root->type == GUMBO_NODE_ELEMENT);
  assert(root->v.element.children.length >= 2);

  // Step 1: scan the root's children until the <head> element turns up.
  const GumboVector* top_level = &root->v.element.children;
  const int top_level_count = top_level->length;
  GumboNode* head = NULL;
  int idx = 0;
  while (head == NULL && idx < top_level_count) {
    GumboNode* candidate = top_level->data[idx];
    if (candidate->type == GUMBO_NODE_ELEMENT &&
        candidate->v.element.tag == GUMBO_TAG_HEAD) {
      head = candidate;
    }
    ++idx;
  }
  assert(head != NULL);

  // Step 2: scan the children of <head> for the first <title> element.
  GumboVector* in_head = &head->v.element.children;
  for (int k = 0; k < in_head->length; ++k) {
    GumboNode* node = in_head->data[k];
    if (node->type != GUMBO_NODE_ELEMENT) {
      continue;
    }
    if (node->v.element.tag != GUMBO_TAG_TITLE) {
      continue;
    }

    // Anything other than exactly one child is reported as an empty title.
    GumboVector* title_children = &node->v.element.children;
    if (title_children->length != 1) {
      return "<empty title>";
    }
    GumboNode* title_text = title_children->data[0];
    assert(title_text->type == GUMBO_NODE_TEXT);
    return title_text->v.text.text;
  }

  return "<no title found>";
}
先到这里,明天继续分析。