当前位置: 首页 > 工具软件 > Web-Harvest > 使用案例 >

Web-Harvest(2)

苏淇
2023-12-01

Analysis of the CSDN blog page


====== search criteria

<div class="interact">
<a href="http://my.csdn.net/my/letter/send/cping1982" class="letter" title="[发私信]"></a>
<!--<a href="#" class="attented" title="已关注"></a>-->
<a href="#" class="attent" id="span_add_follow" title="[加关注]"></a>
</div>
        <div id="blog_medal">

        </div>
        <ul id="blog_rank">
            <li>访问:<span>1103783次</span></li>
            <li>积分:<span>16734分</span></li>
            <li>排名:<span>第72名</span></li>
        </ul>

        <ul id="blog_statistics">
            <li>原创:<span>310篇</span></li>
            <li>转载:<span>65篇</span></li>
            <li>译文:<span>3篇</span></li>
            <li>评论:<span>4894条</span></li>

        </ul>
    </ul>
</div>


====
<div id="panel_Category" class="panel">
    <ul class="panel_head"><span>文章分类</span></ul>

    <ul class="panel_body">
        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/447055">JAVA应用</a><span>(79)</span>
        
        </li>
        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/455610">JAVA游戏开发</a><span>(58)</span>
        
        </li>

        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/403546">原创小说</a><span>(46)</span>
        
        </li>
        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/496919">技术文章</a><span>(61)</span>
        
        </li>
        <li>

        <a href="http://blog.csdn.net/cping1982/article/category/634934">杂文杂记</a><span>(50)</span>
        
        </li>
        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/530863">转载文章</a><span>(7)</span>
        
        </li>
        <li>
        <a href="http://blog.csdn.net/cping1982/article/category/865921">Android移植</a><span>(3)</span>

        
        </li>
    </ul>
</div>
===


<div id="hotarticls" class="panel">
    <ul class="panel_head"><span>阅读排行</span></ul>
    <ul class="panel_body">
        <li>
            <a href="/cping1982/article/details/2166968" title="ExtJS2.0开发与实践笔记[0]——初识ExtJS">ExtJS2.0开发与实践笔记[0]——...</a> (31772)
        </li>

        <li>
            <a href="/cping1982/article/details/6176191" title="Android游戏框架Libgdx使用入门">Android游戏框架Libgdx使用入...</a> (27603)
        </li>
        <li>
            <a href="/cping1982/article/details/6460357" title="浅谈2011年上半年Java游戏领域动态">浅谈2011年上半年Java游戏领域动态</a> (25388)
        </li>
        <li>
            <a href="/cping1982/article/details/5186072" title="未睹棺椁先哭君——谷歌墓志铭">未睹棺椁先哭君——谷歌墓志铭</a> (22452)
        </li>

        <li>
            <a href="/cping1982/article/details/6227775" title="Android游戏框架AndEngine使用入门">Android游戏框架AndEngine...</a> (21613)
        </li>
        <li>
            <a href="/cping1982/article/details/1931539" title="浅谈java.util.concurrent包的并发处理">浅谈java.util.concurre...</a> (17942)
        </li>
        <li>
            <a href="/cping1982/article/details/6072188" title="为什么没有好用的Android游戏引擎?">为什么没有好用的Android游戏引擎?</a> (17522)
        </li>

        <li>
            <a href="/cping1982/article/details/2806598" title="Java&.Net虚拟机精简(GreenJVM&GreenDotNet发布)">Java&.Net虚拟机精简(Green...</a> (16850)
        </li>
        <li>
            <a href="/cping1982/article/details/6006760" title="Android游戏开发示例——弹幕+战棋">Android游戏开发示例——弹幕+战棋</a> (16316)
        </li>
        <li>

            <a href="/cping1982/article/details/1869430" title="中国本土化编程(汉语编程)之我见">中国本土化编程(汉语编程)之我见</a> (15610)
        </li>
    </ul>
</div>


= one item 

    <div class="list_item article_item">
        <div class="article_title">

    <span class="ico ico_type_Repost"></span>
    <h3>
        <span class="link_title"><a href="/garyyding/article/details/7063265">
        Learn JOGL
        </a></span>
    </h3>
</div>

        <div class="article_description">
wei495715356 has some introduce for Nehe
http://www.iteye.com/topic/671095


HCQmaker has some course for OpenGL
http://hcqmaker.iteye.com/blog/241320


wjyjimy has some course for OpenGL

h...        </div>

        <div class="article_manage">
    <span class="link_postdate">2011-12-12 13:29</span>
    <span class="link_view" title="阅读次数"><a href="/garyyding/article/details/7063265" title="阅读次数">阅读</a>(27)</span>
    <span class="link_comments" title="评论次数"><a href="/garyyding/article/details/7063265#comments" title="评论次数">评论</a>(0)</span>
    
    <span class="link_edit"><a href="http://write.blog.csdn.net/postedit/7063265" title="编辑">编辑</a></span>
    <span class="link_delete"><a href="javascript:void(0);" onclick="javascript:deleteArticle(7063265);return false;" title="删除">删除</a></span>

</div>


Web-Harvest configuration code

<?xml version="1.0" encoding="UTF-8"?>

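<!-- Expects the following initial variable: searchResult - a wrapped object that exposes "user" and "rootWebSite" and receives the scraped profile, categories and archives -->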

<config charset="UTF-8">
    <include path="functions.xml" />

    <!-- Values derived from the caller-supplied "searchResult" variable.
         overwrite="false" leaves a variable untouched when it is already
         defined in the context. -->
    <!-- The wrapped object itself. NOTE(review): not referenced anywhere in
         the visible configuration; kept for callers that may read it. -->
    <var-def name="searchResultObject" overwrite="false">
        <template>${searchResult.getWrappedObject()}</template>
    </var-def>
    <!-- Blog owner; also used to build the output file name. -->
    <var-def name="currentUser" overwrite="false">
        <template>${searchResult.getWrappedObject().user}</template>
    </var-def>
    <!-- Start URL: rootWebSite + "/" + user. The template is kept on a single
         line so that no newline or indentation ends up inside the URL. -->
    <var-def name="targetWebsite" overwrite="false">
        <template>${searchResult.getWrappedObject().rootWebSite}/${searchResult.getWrappedObject().user}</template>
    </var-def>
    <!-- Site root; prefixed to the article hrefs when articles are recorded. -->
    <var-def name="rootWebsite" overwrite="false">
        <template>${searchResult.getWrappedObject().rootWebSite}</template>
    </var-def>

    <!-- Download the blog home page and convert its HTML to XML so that the
         XPath expressions below can be evaluated against it. -->
    <var-def name="doc">
        <html-to-xml>
            <http url="${targetWebsite}" />
        </html-to-xml>
    </var-def>
    <!-- Profile panel: the <ul> whose class is exactly 'panel_body profile'. -->
    <var-def name="interact">
        <xpath expression="//ul[@class='panel_body profile']">
            <var name="doc" />
        </xpath>
    </var-def>
    <!-- One <li> per article category in the 'panel_Category' panel. -->
    <var-def name="categories">
        <xpath expression="//div[@id='panel_Category']/ul[@class='panel_body']/li">
            <var name="doc" />
        </xpath>
    </var-def>
    <!-- One <li> per archive entry under 'archive_list' in the 'panel_Archive' panel. -->
    <var-def name="fileArchive">
        <xpath expression="//div[@id='panel_Archive']/ul[@class='panel_body']/div[@id='archive_list']/li">
            <var name="doc" />
        </xpath>
    </var-def>
    <!-- Each definition below calls the "regexp" function from functions.xml:
         itemPath selects one <li> inside the profile panel and the rule [\d]+
         pulls the digits out of it. The meaning given for each <li> follows
         the sampled profile markup; verify against the live page. -->
    <!-- blog_rank li[1]: visit count. -->
    <var-def name="interact_fangwen">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_rank']/li[1]</call-param>
        </call>
    </var-def>
    <!-- blog_rank li[2]: points. -->
    <var-def name="interact_jifen">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_rank']/li[2]</call-param>
        </call>
    </var-def>
    <!-- blog_rank li[3]: rank. -->
    <var-def name="interact_paiming">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_rank']/li[3]</call-param>
        </call>
    </var-def>
    <!-- blog_statistics li[1]: number of original posts. -->
    <var-def name="interact_original">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_statistics']/li[1]
            </call-param>
        </call>
    </var-def>
    <!-- blog_statistics li[2]: number of reposted articles. -->
    <var-def name="interact_get">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_statistics']/li[2]
            </call-param>
        </call>
    </var-def>
    <!-- blog_statistics li[3]: number of translated articles. -->
    <var-def name="interact_translate">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_statistics']/li[3]
            </call-param>
        </call>
    </var-def>
    <!-- blog_statistics li[4]: number of comments. -->
    <var-def name="interact_comment">
        <call name="regexp">
            <call-param name="rule">[\d]+</call-param>
            <call-param name="content">
                <var name="interact" />
            </call-param>
            <call-param name="itemPath">//ul[@id='blog_statistics']/li[4]
            </call-param>
        </call>
    </var-def>

    <!-- Hands the scraped profile numbers to the wrapped searchResult object.
         Arguments are passed in this order: points, rank, visits, originals,
         reposts, translations, comments.
         NOTE(review): addProfile is defined outside this file; confirm that
         this order matches its signature. -->
    <script><![CDATA[           
        Object o=searchResult.getWrappedObject();   
        o.addProfile(interact_jifen.toString(),interact_paiming.toString(),interact_fangwen.toString(),interact_original.toString(),interact_get.toString(),interact_translate.toString(),interact_comment.toString()); 
                                         
    ]]></script>

    <!-- For every category <li>: register the category on the wrapped
         searchResult object, download the category page and register each
         article listed on it. Only the single page at ${u} is fetched; no
         next-page link is followed here. -->
    <loop item="category" index="i" filter="unique">
        <list>
            <var name="categories" />
        </list>
        <body>
            <!-- Category name: text of the link inside the <li>. -->
            <var-def name="title">
                <xpath expression="//a/text()">
                    <var name="category" />
                </xpath>
            </var-def>
            <!-- Category URL: href of the link; passed unchanged to <http> below. -->
            <var-def name="u">
                <xpath expression="//a/@href[1]">
                    <var name="category" />
                </xpath>
            </var-def>
            <script><![CDATA[           
                 Object o=searchResult.getWrappedObject();               
                 o.addFileCategory(title.toString(),u.toString());                         
             ]]></script>                       
            <!-- All article entries (div.list_item.article_item) on the category page. -->
            <var-def name="category_doc">
                <xpath expression="//div[@class='list_item article_item']">
                    <html-to-xml>
                        <http url="${u}" />
                    </html-to-xml>
                </xpath>
            </var-def>
            <!-- Extract the fields of each article entry and attach it to the category. -->
            <loop item="onecategory" index="j" filter="unique">
                <list>
                    <var name="category_doc" />
                </list>
                <body>
                    <!-- Publication date text from span.link_postdate. -->
                    <var-def name="f_datetime">
                        <xpath
                            expression="//div[@class='article_manage']/span[@class='link_postdate']/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Digits found in the text of span.link_view (read count). -->
                    <var-def name="f_link_view">
                        <call name="regexp">
                            <call-param name="rule">[\d]+</call-param>
                            <call-param name="content"><var name="onecategory" /></call-param>
                            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_view']/text()</call-param>
                        </call>
                    </var-def>
                    <!-- Digits found in the text of span.link_comments (comment count). -->
                    <var-def name="f_link_comments">
                        <call name="regexp">
                            <call-param name="rule">[\d]+</call-param>
                            <call-param name="content"><var name="onecategory" /></call-param>
                            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_comments']/text()</call-param>
                        </call> 
                    </var-def>
                    <!-- Article title text. -->
                    <var-def name="f_name">
                        <xpath
                            expression="//div[@class='article_title']/h3/span[@class='link_title']/a/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Article href; the script below prefixes it with rootWebsite. -->
                    <var-def name="f_url">
                        <xpath
                            expression="//div[@class='article_title']/h3/span[@class='link_title']/a/@href">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Text of the article description block. -->
                    <var-def name="f_description">
                        <xpath expression="//div[@class='article_description']/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Register the article under the current category title. -->
                    <script><![CDATA[
                       Object o=searchResult.getWrappedObject();
                       o.addFileIntoCategory(f_name.toString(),rootWebsite.toString()+f_url.toString(),title.toString(),f_datetime.toString(),f_link_view.toString(),f_link_comments.toString(),f_description.toString());  
                    ]]></script>
                </body>
            </loop>
        </body>
    </loop>
  
  <!-- For every archive <li>: register the archive on the wrapped
       searchResult object, download the archive page and register each
       article listed on it. Mirrors the category loop, but calls
       addFileArchive / addFileIntoFileArchives. Only the single page at ${u}
       is fetched. -->
  <loop item="category" index="i" filter="unique">
        <list>
            <var name="fileArchive" />
        </list>
        <body>
            <!-- Archive label: text of the link inside the <li>. -->
            <var-def name="title">
                <xpath expression="//a/text()">
                    <var name="category" />
                </xpath>
            </var-def>
            <!-- Archive URL: href of the link; passed unchanged to <http> below. -->
            <var-def name="u" overwrite="true">
                <xpath expression="//a/@href[1]">
                    <var name="category" />
                </xpath>
            </var-def>
            <script><![CDATA[           
                 Object o=searchResult.getWrappedObject();               
                 o.addFileArchive(title.toString(),u.toString());     
                                      
             ]]></script>                       
            <!-- All article entries (div.list_item.article_item) on the archive page. -->
            <var-def name="category_doc">
                <xpath expression="//div[@class='list_item article_item']">
                    <html-to-xml>
                        <http url="${u}" />
                    </html-to-xml>
                </xpath>
            </var-def>
            <!-- Extract the fields of each article entry and attach it to the archive. -->
            <loop item="onecategory" index="j" filter="unique">
                <list>
                    <var name="category_doc" />
                </list>
                <body>
                    <!-- Publication date text from span.link_postdate. -->
                    <var-def name="f_datetime">
                        <xpath
                            expression="//div[@class='article_manage']/span[@class='link_postdate']/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Digits found in the text of span.link_view (read count). -->
                    <var-def name="f_link_view">
                        <call name="regexp">
                            <call-param name="rule">[\d]+</call-param>
                            <call-param name="content"><var name="onecategory" /></call-param>
                            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_view']/text()</call-param>
                        </call>
                    </var-def>
                    <!-- Digits found in the text of span.link_comments (comment count). -->
                    <var-def name="f_link_comments">
                        <call name="regexp">
                            <call-param name="rule">[\d]+</call-param>
                            <call-param name="content"><var name="onecategory" /></call-param>
                            <call-param name="itemPath">//div[@class='article_manage']/span[@class='link_comments']/text()</call-param>
                        </call> 
                    </var-def>
                    <!-- Article title text. -->
                    <var-def name="f_name">
                        <xpath
                            expression="//div[@class='article_title']/h3/span[@class='link_title']/a/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Article href; the script below prefixes it with rootWebsite. -->
                    <var-def name="f_url">
                        <xpath
                            expression="//div[@class='article_title']/h3/span[@class='link_title']/a/@href">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Text of the article description block. -->
                    <var-def name="f_description">
                        <xpath expression="//div[@class='article_description']/text()">
                            <var name="onecategory" />
                        </xpath>
                    </var-def>
                    <!-- Register the article under the current archive title. -->
                    <script><![CDATA[
                       Object o=searchResult.getWrappedObject();
                       o.addFileIntoFileArchives(f_name.toString(),rootWebsite.toString()+f_url.toString(),title.toString(),f_datetime.toString(),f_link_view.toString(),f_link_comments.toString(),f_description.toString());  
                    ]]></script>
                </body>
            </loop>
        </body>
    </loop>
    <!-- Rebinds the context variables "categories" and "fileArchive": from
         here on they hold the collections returned by the wrapped object
         (getCategories / getFileArchives) instead of the XPath results they
         were first defined with. The <file> section below iterates over them. -->
    <script><![CDATA[   
       SetContextVar("categories", searchResult.getWrappedObject().getCategories());       
       SetContextVar("fileArchive", searchResult.getWrappedObject().getFileArchives());                                        
    ]]></script>
  
    <!-- Writes the collected data to csdn_<currentUser>.xml in this shape:
           <user name="..."><categories><category ...><article ...>
           <description>...</description></article></category></categories>
           <archives><archive ...><article .../></archive></archives></user>
         The <archives> element is opened exactly once. The previous version
         emitted the opening tag twice and closed it once, which left the
         output not well-formed.
         NOTE(review): names, URLs and descriptions are written unescaped, so
         a value containing &, < or a double quote would still break the
         output; consider escaping them (for example with sys.escapeXml) once
         the getter return types are confirmed. -->
    <file action="write" path="csdn_${currentUser}.xml" charset="UTF-8">
        <template>
            <![CDATA[ <user name="${currentUser}"> <categories>]]>
        </template>
        <!-- One <category> element per collected category. -->
        <loop item="category" index="i" filter="unique">
            <list>
                <var name="categories" />
            </list>
            <body>
                <template><![CDATA[ 
                   <category name="${category.getWrappedObject().getName()}" url="${category.getWrappedObject().getUrl()}"> ]]>
                </template>
                <!-- Expose the articles of the current category to the inner loop. -->
                <script><![CDATA[
                   SetContextVar("files", category.getWrappedObject().getFiles());
                ]]></script>

                <!-- Index is named "j" so it does not reuse the enclosing loop's "i". -->
                <loop item="file" index="j" filter="unique">
                    <list>
                        <var name="files" />
                    </list>
                    <body>
                        <template><![CDATA[ 
                           <article name="${file.getWrappedObject().getFileName()}" url="${file.getWrappedObject().getFileUrl()}" dateTime="${file.getWrappedObject().getFileDateTime()}" read="${file.getWrappedObject().getCount_read()}" comments="${file.getWrappedObject().getCount_comment()}">
                           <description>
                           ${file.getWrappedObject().getDescription()}                           
                           </description>                           
                           </article> 
                         ]]></template>
                    </body>
                </loop>
                <template><![CDATA[ </category> ]]></template>
            </body>
        </loop> 

        <!-- Close the category list and open the archive list (once). -->
        <![CDATA[ </categories><archives> ]]>
        <!-- One <archive> element per collected archive; articles are written
             without a description here. -->
        <loop item="category" index="i" filter="unique">
            <list>
                <var name="fileArchive" />
            </list>
            <body>
                <template><![CDATA[ 
                   <archive name="${category.getWrappedObject().getName()}" url="${category.getWrappedObject().getUrl()}"> ]]>
                </template>
                <!-- Expose the articles of the current archive to the inner loop. -->
                <script><![CDATA[
                   SetContextVar("files", category.getWrappedObject().getFiles());
                ]]></script>

                <loop item="file" index="j" filter="unique">
                    <list>
                        <var name="files" />
                    </list>
                    <body>
                        <template><![CDATA[ 
                           <article name="${file.getWrappedObject().getFileName()}" url="${file.getWrappedObject().getFileUrl()}" dateTime="${file.getWrappedObject().getFileDateTime()}" read="${file.getWrappedObject().getCount_read()}" comments="${file.getWrappedObject().getCount_comment()}"/>
                         ]]></template>
                    </body>
                </loop>
                <template><![CDATA[ </archive> ]]></template>

            </body>
        </loop>         
        <![CDATA[ </archives></user> ]]>
    </file>

</config>

functions.xml


<?xml version="1.0" encoding="UTF-8"?>

<config>
	<!-- Download a multi-page list of items.
		@param pageUrl - URL of the starting page
		@param itemXPath - XPath expression selecting the items on one page
		@param nextXPath - XPath expression yielding the URL of the next page
		@param maxloops - maximum number of pages downloaded
		@return list of all downloaded items
		The loop runs while pageUrl is a non-empty string. On each pass the
		page is fetched, pageUrl is recomputed from the next-page link via
		sys.fullUrl, and the items selected by itemXPath form that pass's
		result. -->
	<function name="download-multipage-list">
		<return>
			<while condition="${pageUrl.toString().length() != 0}"
				maxloops="${maxloops}" index="i">
				<!-- Work wrapped in <empty> only defines variables (page
					content, next link, next pageUrl); per the Web-Harvest
					manual, <empty> itself produces no result. -->
				<empty>
					<var-def name="content">
						<html-to-xml>
							<http url="${pageUrl}" />
						</html-to-xml>
					</var-def>

					<var-def name="nextLinkUrl">
						<xpath expression="${nextXPath}">
							<var name="content" />
						</xpath>
					</var-def>

					<var-def name="pageUrl">
						<template>${sys.fullUrl(pageUrl.toString(),
							nextLinkUrl.toString())}</template>
					</var-def>
				</empty>

				<!-- Items of the page that was just downloaded. -->
				<xpath expression="${itemXPath}">
					<var name="content" />
				</xpath>
			</while>
		</return>
	</function>
	
	<!-- Apply a regular expression to the result of an XPath query.
		@param rule - regular expression pattern
		@param content - XML content the XPath is evaluated against
		@param itemPath - XPath expression whose result is used as the
		regexp source
		@return the value of ${_0} produced for the regexp result
		The empty, no-op script that used to precede <return> was removed. -->
	<function name="regexp">
		<return>
			<regexp>
				<regexp-pattern><var name="rule" /></regexp-pattern>
				<regexp-source>
					<xpath expression="${itemPath}">
						<var name="content" />
					</xpath>
				</regexp-source>
				<regexp-result>
					<template>${_0}</template>
				</regexp-result>
			</regexp>
		</return>
	</function>	
	
</config>



output

<user name="garyyding"> <categories>
<category name="OpenGL" url="http://blog.csdn.net/garyyding/article/category/950136">
<article name="Learn JOGL" url="http://blog.csdn.net/garyyding/article/details/7063265" dateTime="2011-12-12 13:29" read="30" comments="0">
<description>
wei495715356 has some introduce for Nehe
http://www.iteye.com/topic/671095


HCQmaker has some course for OpenGL
http://hcqmaker.iteye.com/blog/241320


wjyjimy has some course for OpenGL

h...
</description>
</article>
<article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0">
<description>
It is a good place to learn OpenGL (Java )


NeHe
http://nehe.gamedev.net/


NeHe demo explaination

Lessons 01 - 05Lessons 06 - 10Lessons 11 - 15Lessons 16 - 20Lessons 21 - 25Lessons 26 - 30...
</description>
</article>
</category>
<category name="Game" url="http://blog.csdn.net/garyyding/article/category/950137">
<article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0">
<description>
It is a good place to learn OpenGL (Java )


NeHe
http://nehe.gamedev.net/


NeHe demo explaination

Lessons 01 - 05Lessons 06 - 10Lessons 11 - 15Lessons 16 - 20Lessons 21 - 25Lessons 26 - 30...
</description>
</article>
</category>
<category name="Other" url="http://blog.csdn.net/garyyding/article/category/951538">
<article name="free svn repository -- www.assembla.com" url="http://blog.csdn.net/garyyding/article/details/7162293" dateTime="2011-12-29 15:43" read="22" comments="0">
<description>
A article has introduced some free svn repositories.
https://www.assembla.com/user/one_page_signup/software_developers_integrated?space_type=catalog
 
I have tried assembla. It is good.
 
My repo...
</description>
</article>
<article name="My PMP" url="http://blog.csdn.net/garyyding/article/details/7069177" dateTime="2011-12-14 09:37" read="22" comments="0">
<description>
https://my.pmi.org/
login as gary.ding


report PDU
https://ccrs.pmi.org/Certificants/ClaimPDU.aspx
Input provider code, you can select course, for example 2858,2854...
</description>
</article>
</category>
</categories><archives>
<archives>
<archive name="2012年03月" url="http://blog.csdn.net/garyyding/article/month/2012/03">
<article name="Web-Harvest(1)" url="http://blog.csdn.net/garyyding/article/details/7409845" dateTime="2012-03-30 09:14" read="8" comments="0"/>
<article name="Use Web-Harvest to data-extract from www.vdisk.cn" url="http://blog.csdn.net/garyyding/article/details/7361178" dateTime="2012-03-16 15:34" read="24" comments="0"/>
</archive>
<archive name="2011年12月" url="http://blog.csdn.net/garyyding/article/month/2011/12">
<article name="free svn repository -- www.assembla.com" url="http://blog.csdn.net/garyyding/article/details/7162293" dateTime="2011-12-29 15:43" read="22" comments="0"/>
<article name="My PMP" url="http://blog.csdn.net/garyyding/article/details/7069177" dateTime="2011-12-14 09:37" read="22" comments="0"/>
<article name="Learn JOGL" url="http://blog.csdn.net/garyyding/article/details/7063265" dateTime="2011-12-12 13:29" read="30" comments="0"/>
<article name="Learn Java OpenGL from NeHe ( Jogl 1.1.2)" url="http://blog.csdn.net/garyyding/article/details/7063205" dateTime="2011-12-12 13:15" read="53" comments="0"/>
</archive>
<archive name="2011年10月" url="http://blog.csdn.net/garyyding/article/month/2011/10">
<article name="Install Android4.0" url="http://blog.csdn.net/garyyding/article/details/6890177" dateTime="2011-10-20 14:24" read="87" comments="0"/>
</archive>
<archive name="2010年04月" url="http://blog.csdn.net/garyyding/article/month/2010/04">
<article name="Mobile phone development" url="http://blog.csdn.net/garyyding/article/details/5508757" dateTime="2010-04-20 22:39" read="23" comments="0"/>
</archive>
<archive name="2009年11月" url="http://blog.csdn.net/garyyding/article/month/2009/11">
<article name="google app engine" url="http://blog.csdn.net/garyyding/article/details/4866348" dateTime="2009-11-24 20:41" read="49" comments="0"/>
<article name="JbossServer5.1GA isolated for 多个 EAR" url="http://blog.csdn.net/garyyding/article/details/4828094" dateTime="2009-11-18 13:50" read="39" comments="0"/>
</archive>
<archive name="2008年12月" url="http://blog.csdn.net/garyyding/article/month/2008/12">
<article name="关于appfuse" url="http://blog.csdn.net/garyyding/article/details/3514216" dateTime="2008-12-14 09:54" read="40" comments="0"/>
</archive>
</archives></user>



 类似资料: