
Merging a WebGraph into a HostGraph

拓拔烨赫
2023-12-01

The LAW laboratory publishes many webgraphs, but it does not provide the corresponding hostgraphs. A hostgraph collapses all URLs of a webgraph that belong to the same host into a single node. Conveniently, URLs belonging to the same host are stored consecutively in these webgraphs, which makes the merge much easier. I originally planned to write this in Java, but since Java's I/O performance on Windows is rather poor, I used C# instead. I used IKVM to compile webgraph.jar and its dependency jars into a single webgraph.dll.
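For reference, the IKVM packaging step can look roughly like the command below. This is only a sketch: the set of dependency jars (dsiutils.jar, fastutil.jar and log4j.jar are listed here as an assumption) depends on the WebGraph version you downloaded.

    ikvmc -target:library -out:webgraph.dll webgraph.jar dsiutils.jar fastutil.jar log4j.jar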

 

The merging algorithm is fairly simple and consists of two steps: first, scan the URL file and build the mapping from URL nodes to host nodes; second, read the webgraph, merge the nodes, and generate the hostgraph.
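The mapping produced by the first step is written to a .map file with one line per host, in the form "start-end hostIndex", where start and end are the first and last URL node of that host in the original webgraph. The concrete numbers below are purely illustrative:

    0-17 0
    18-41 1
    42-42 2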

 

The code is as follows:

 
 
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using it.unimi.dsi.webgraph;
using org.apache.log4j;

namespace indochina_2004host
{
    class Program
    {
        // base name of the LAW dataset on disk (adjust to your local path)
        static string basename = @"D:\研究数据集\law datasets\uk-2007-05\uk-2007-05";
        static Logger logger;
        static bool offline = false;

        static Program()
        {
            Logger.getRootLogger().addAppender(new ConsoleAppender(new TTCCLayout(), ConsoleAppender.SYSTEM_OUT));
            logger = Logger.getLogger(typeof(Program));
        }

        static void Main(string[] args)
        {
            MergeUrl();
            MergeGraph();
        }
 
 
        static void MergeGraph()
        {
            logger.info("start merge graph");
            string[] maps = File.ReadAllLines(basename + ".map");
            BVGraph bg;
            logger.info("loading graph");
            if (offline)
                bg = BVGraph.loadOffline(basename);
            else
                bg = BVGraph.load(basename);

            using (StreamWriter sw = new StreamWriter(basename + ".hostgraph.graph-txt"))
            {
                NodeIterator it = bg.nodeIterator();
                // build the url-node -> host-node map
                logger.info("generate map");
                int[] map = new int[bg.numNodes()];
                int[] start = new int[maps.Length];
                int[] end = new int[maps.Length];

                for (int k = 0; k < maps.Length; k++)
                {
                    // each line of the .map file has the form "start-end hostIndex"
                    string line = maps[k];
                    string[] splits = line.Split(' ', '-');
                    int value = Convert.ToInt32(splits[2]);
                    int st = Convert.ToInt32(splits[0]);
                    int en = Convert.ToInt32(splits[1]);
                    start[k] = st;
                    end[k] = en;
                    for (int i = st; i <= en; i++)
                        map[i] = value;
                    maps[k] = null;
                }
                // release maps
                maps = null;
                GC.Collect();
                logger.info("map length: " + start.Length);
                sw.WriteLine(start.Length);
                // start merging
                logger.info("merging...");

                for (int k = 0; k < start.Length; k++)
                {
                    logger.info("merging " + start[k] + "-" + end[k] + ": " + map[start[k]]);
                    SortedSet<int> successors = new SortedSet<int>();

                    for (int i = start[k]; i <= end[k]; i++)
                    {
                        it.nextInt();
                        LazyIntIterator lit = it.successors();
                        int j;
                        while ((j = lit.nextInt()) != -1)
                            successors.Add(j);
                    }
                    // map url successors to host successors and drop the self-loop
                    SortedSet<int> after = new SortedSet<int>();
                    foreach (int successor in successors)
                        after.Add(map[successor]);
                    after.Remove(k);
                    int[] ts = after.ToArray();
                    for (int i = 0; i < ts.Length - 1; i++)
                    {
                        sw.Write(ts[i]);
                        sw.Write(' ');
                    }
                    if (ts.Length != 0)
                        sw.WriteLine(ts[ts.Length - 1]);
                    else
                        sw.WriteLine();
                }
                sw.Flush();
            }
            logger.info("end merge graph");
        }
 
 
        static void MergeUrl()
        {
            logger.info("start merge url");
            using (StreamReader urlSr = new StreamReader(basename + ".urls"))
            using (StreamWriter hostnamesSw = new StreamWriter(basename + ".hostnames.txt"))
            using (StreamWriter mapSw = new StreamWriter(basename + ".map"))
            {
                long i = 0;   // index of the current url
                long j = 0;   // index of the first url of the current host
                long k = 0;   // index of the current host
                string host = null;
                string url = null;
                while (!urlSr.EndOfStream)
                {
                    url = urlSr.ReadLine();
                    // extract the host part of the url
                    string curhost = url.ToLower().Replace("http://", "");
                    int slash = curhost.IndexOf('/');
                    if (slash >= 0)   // guard against urls without a path
                        curhost = curhost.Substring(0, slash);
                    curhost = curhost.Trim();
                    if (host != curhost)
                    {
                        // a new host starts: flush the previous host's range
                        if (host != null)
                        {
                            hostnamesSw.WriteLine(host);
                            mapSw.WriteLine(j + "-" + (i - 1) + " " + k);
                            k++;
                            j = i;
                        }
                        host = curhost;
                    }
                    i++;
                }
                // flush the last host
                hostnamesSw.WriteLine(host);
                mapSw.WriteLine(j + "-" + (i - 1) + " " + k);
                hostnamesSw.Flush();
                mapSw.Flush();
            }
            logger.info("end merge url");
        }
    }
}
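The resulting .hostgraph.graph-txt file is in WebGraph's ASCII graph format: the first line is the number of host nodes, and each following line lists one node's successors in increasing order, separated by spaces. If a compressed hostgraph is needed, a minimal sketch of converting it back to the BVGraph format through the same IKVM-compiled webgraph.dll could look like the snippet below. This assumes ASCIIGraph exposes the usual static load method of WebGraph's ImmutableGraph implementations; the basename is the one used above.

// hypothetical follow-up step, not part of the program above:
// load the ASCII hostgraph and store it as a compressed BVGraph
ImmutableGraph hostGraph = ASCIIGraph.load(basename + ".hostgraph");
BVGraph.store(hostGraph, basename + ".hostgraph");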
 