有关Html Parser的使用范例， URL解析器

傅明知

2023-12-01

摘自陈华的BLOG, 如果你不知道陈华是谁，请访问www.kooxoo.com^_^
/*
file: urlparser.h
desc: parse a html page, get each url and its link text
author: chen hua
create: 2005-03-16
*/
#ifndef _URL_PARSER_H_
#define _URL_PARSER_H_
#include "basehtmlparser.h"
#include <map>
#include <string>
using namespace std;
class CURLParser: public CBaseHtmlParser
{
public:
CURLParser(){m_bInTagA=false;}
bool Parse(const string& URL,const string& Content);
public:
map<string,string> m_URL2Text;
private:
bool m_bInTagA;
string m_strCurURL;
void OnStartTag(const SZ_STRING & strTagName,vector< pair<SZ_STRING,SZ_STRING> > Attribs);
void OnEndTag(const SZ_STRING & strTagName);
void OnData(const SZ_STRING & strData);
};
#endif
#include "urlparser.h"
#ifndef WIN32
#define strnicmp strncasecmp
#endif
bool CURLParser::Parse(const string& URL,const string& Content)
{
m_URL2Text.clear();
return CBaseHtmlParser::Parse(URL,Content);
}
void CURLParser::OnStartTag(const SZ_STRING & strTagName,vector< pair<SZ_STRING,SZ_STRING> > Attribs)
{
if((strTagName.cbData==1)&&(strnicmp(strTagName.pbData,"A",strTagName.cbData)==0))
{
m_bInTagA=true;
m_strCurURL.clear();
for(size_t i=0;i<Attribs.size();i++)
{
SZ_STRING x=Attribs[i].first;
if((Attribs[i].first.cbData==4)&&(strnicmp(Attribs[i].first.pbData,"href",Attribs[i].first.cbData)==0))
{
m_strCurURL=string(Attribs[i].second.pbData,Attribs[i].second.cbData);
Relativity2AbsoluteURL(m_strCurURL);
break;
}
}
}
}
void CURLParser::OnEndTag(const SZ_STRING & strTagName)
{
if((strTagName.cbData==1)&&(strnicmp(strTagName.pbData,"A",strTagName.cbData)==0))
{
m_bInTagA=false;
}
if((strTagName.cbData==2)&&(strnicmp(strTagName.pbData,"td",strTagName.cbData)==0))
{
m_bInTagA=false;
}
}
void CURLParser::OnData(const SZ_STRING & strData)
{
if(m_bInTagA)
{
if(!m_strCurURL.empty())
m_URL2Text[m_strCurURL]=m_URL2Text[m_strCurURL]+string(strData.pbData,strData.cbData);
}
}

有关Html Parser的使用范例， URL解析器

相关阅读

相关文章

相关问答

相关文档