Pajek的网络输入数据格式为*.net,而今天想下的一些数据都是*.gml格式的。
http://www-personal.umich.edu/~mejn/netdata/
Mark Newman在提供这些开源的数据的同时,提供了他写的一个简单GML分析器的C源码。我修改了一点,加了个主函数用来生成*.net文件。
如果*.gml文件提供更多信息,如节点的分类等,也可以修改这个简单的GML2Pajek来增加更多内容。
如果需要读取*.gml中更多信息,需要增加的地方:
1. 数据结构
2. 对应GML分析器中的函数
3. 将NETWORK输出到Pajek支持格式的函数(可能还要打开新文件,如*.vec)
我修改了Mark Newman提供的分析器的一部分。但是我认为Mark Newman放出这个程序的时候一定是有经过测试的,我不知道为什么我不修改就不能用了(不能转换他主页上提供的gml格式数据)。可能和一些我不知道的原因有关。
同时由于Newman提供的部分数据带有结点的value,我对每个*.gml都会生成对应的*.vec文件。如果*.gml不带有value,这个*.vec是多余的,我没有做判断。
下面是源代码,Newman提供的gml parser我做了一些修改。
gml2pajek.c
// Program to read a network stored in a GML file into a NETWORK struct
//
// Yi-Shi Lin 12 AUG 03
//
// A simple parser in C that will read the files into a data structure is needed
// It is provided by Mark Newman:
// http://www-personal.umich.edu/~mejn/netdata/readgml.zip
//
// If more infomation in *.gml is needed
// Please modify the data structure defined in network.h
// and the corresponding functions in readgml.c and gml2pajek.c
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "readgml.h"
#define FILENAMELEN 100
// Function to open *.gml and *.net
int open_file(FILE **gml_file, FILE **net_file, FILE **vec_file)
{
char filename[FILENAMELEN];
int len;
printf("\ninput *.gml\n");
scanf("%s", filename);
len=strlen(filename);
if (((*gml_file)=fopen(filename, "r"))==NULL)
{
printf("fail to open \"%s\"\n", filename);
return 0;
}
printf("%s has been opened...\n", filename);
filename[len-3]='n';
filename[len-2]='e';
filename[len-1]='t';
if (((*net_file)=fopen(filename, "w"))==NULL)
{
printf("fail to open \"%s\"\n", filename);
return 0;
}
filename[len-3]='v';
filename[len-2]='e';
filename[len-1]='c';
if (((*vec_file)=fopen(filename, "w"))==NULL)
{
printf("fail to open \"%s\"\n", filename);
return 0;
}
return 1;
}
// Function to close *.gml and *.net
void close_file(FILE *gml_file, FILE *net_file, FILE *vec_file)
{
fclose(gml_file);
fclose(net_file);
fclose(vec_file);
}
// Function to write the network into *.net
void write_network(NETWORK *network, FILE *net_file, FILE *vec_file)
{
int i,j;
fprintf(net_file, "*Vertices %d\n", network->nvertices);
for (i=0; i<network->nvertices; i++)
{
fprintf(net_file, "%d ", i+1);
if (network->vertex[i].label)
fprintf(net_file, "\"%s\"", network->vertex[i].label);
fprintf(net_file, "\n");
}
fprintf(net_file, "*Edges\n");
for (i=0; i<network->nvertices; i++)
{
for (j=0; j<network->vertex[i].degree; j++)
{
EDGE *tmp_edge=&(network->vertex[i].edge[j]);
fprintf(net_file, "%d %d %.lf\n", i+1, tmp_edge->target+1, tmp_edge->weight);
}
}
printf("*.net has been generated\n");
fprintf(vec_file, "*Vertices %d\n", network->nvertices);
for (i=0; i<network->nvertices; i++)
{
fprintf(vec_file, "%lf\n", network->vertex[i].value);
}
printf("*.vec has been generated\n");
}
int main()
{
NETWORK network;
FILE *gml_file, *net_file, *vec_file;
while (1)
{
if (open_file(&gml_file, &net_file, &vec_file)==0)
{
continue;
}
if (read_network(&network, gml_file)!=0)
{
printf("fail to read the network!\n");
continue;
}
write_network(&network, net_file, vec_file);
close_file(gml_file, net_file, vec_file);
free_network(&network);
}
return 0;
}
network.h
// Header file for VERTEX, EDGE, and NETWORK data structures
//
// Mark Newman 11 AUG 06
#ifndef _NETWORK_H
#define _NETWORK_H
typedef struct
{
int target; // Index in the vertex[] array of neighboring vertex.
// (Note that this is not necessarily equal to the GML
// ID of the neighbor if IDs are nonconsecutive or do
// not start at zero.)
double weight; // Weight of edge. 1 if no weight is specified.
} EDGE;
typedef struct
{
int id; // GML ID number of vertex
int degree; // Degree of vertex (out-degree for directed nets)
double value; // Value of vertex
char *label; // GML label of vertex. NULL if no label specified
EDGE *edge; // Array of EDGE structs, one for each neighbor
} VERTEX;
typedef struct
{
int nvertices; // Number of vertices in network
int directed; // 1 = directed network, 0 = undirected
char has_value;
VERTEX *vertex; // Array of VERTEX structs, one for each vertex
} NETWORK;
#endif
readgml.h
// Header file for the parser for GML files
//
// Mark Newman 11 AUG 06
#ifndef _READGML_H
#define _READGML_H
#include <stdio.h>
#include "network.h"
int read_network(NETWORK *network, FILE *stream);
void free_network(NETWORK *network);
#endif
readgml.c
// Functions to read a network stored in a GML file into a NETWORK struct
//
// Mark Newman 11 AUG 06
//
// To use this software, #include "readgml.h" at the head of your program
// and then call the following.
//
// Function calls:
// int read_network(NETWORK *network, FILE *stream)
// -- Reads a network from the FILE pointed to by "stream" into the
// structure "network". For the format of NETWORK structs see file
// "network.h". Returns 0 if read was successful.
// void free_network(NETWORK *network)
// -- Destroys a NETWORK struct again, freeing up the memory
// Inclusions
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "network.h"
// Constants
#define LINELENGTH 1000
// Types
typedef struct line
{
char *str;
struct line *ptr;
} LINE;
// Globals
LINE *first;
LINE *current;
// Function to read one line from a specified stream. Return value is
// 1 if an EOF was encountered. Otherwise 0.
int read_line(FILE *stream, char line[LINELENGTH])
{
if (fgets(line,LINELENGTH,stream)==NULL) return 1;
line[strlen(line)-1] = '\0'; // Erase the terminating NEWLINE
return 0;
}
// Function to read in the whole file into a linked-list buffer, so that we
// can do several passes on it, as required to read the GML format
// efficiently
int fill_buffer(FILE *stream)
{
int length;
char line[LINELENGTH];
LINE *previous;
if (stream==NULL || read_line(stream,line)!=0)
{
first = NULL;// Indicates empty buffer
return 1;
}
length = strlen(line) + 1;
first = malloc(sizeof(LINE));
first->str = malloc(length*sizeof(char));
strcpy(first->str,line);
previous = first;
while (read_line(stream,line)==0)
{
length = strlen(line) + 1;
previous->ptr = malloc(sizeof(LINE));
previous = previous->ptr;
previous->str = malloc(length*sizeof(char));
strcpy(previous->str,line);
}
previous->ptr = NULL; // Indicates last line
return 0;
}
// Function to free up the buffer again
void free_buffer()
{
LINE *thisptr;
LINE *nextptr;
thisptr = first;
while (thisptr!=NULL)
{
nextptr = thisptr->ptr;
free(thisptr->str);
free(thisptr);
thisptr = nextptr;
}
}
// Function to reset to the start of the buffer again
void reset_buffer()
{
current = first;
}
// Function to get the next line in the buffer. Returns 0 if there was
// a line or 1 if we've reached the end of the buffer.
int next_line(char line[LINELENGTH])
{
if (current==NULL) return 1;
strcpy(line,current->str);
current = current->ptr;
return 0;
}
// Function to establish whether the network read from a given stream is
// directed or not. Returns 1 for a directed network, and 0 otherwise. If
// the GML file contains no "directed" line then the graph is assumed to be
// undirected, which is the GML default behavior.
int is_directed()
{
int result=0;
char *ptr;
char line[LINELENGTH];
reset_buffer();
while (next_line(line)==0)
{
ptr = strstr(line,"directed");
if (ptr==NULL) continue;
sscanf(ptr,"directed %i",&result);
break;
}
return result;
}
// Function to count the vertices in a GML file. Returns number of vertices.
int count_vertices()
{
int result=0;
char *ptr;
char line[LINELENGTH];
reset_buffer();
while (next_line(line)==0) {
ptr = strstr(line,"node");
if (ptr!=NULL) {
ptr = strstr(line,"label \"");
if (ptr==NULL) result++;
}
}
return result;
}
// Function to compare the IDs of two vertices
int cmpid(VERTEX *v1p, VERTEX *v2p)
{
if (v1p->id>v2p->id) return 1;
if (v1p->id<v2p->id) return -1;
return 0;
}
// Function to allocate space for a network structure stored in a GML file
// and determine the parameters (id, label) of each of the vertices.
void create_network(NETWORK *network)
{
int i;
int length;
char *ptr;
char *start,*stop;
char line[LINELENGTH];
char label[LINELENGTH];
// Determine whether the network is directed
network->directed = is_directed();
// Count the vertices
network->nvertices = count_vertices();
// Make space for the vertices
network->vertex = calloc(network->nvertices,sizeof(VERTEX));
// Go through the file reading the details of each vertex one by one
reset_buffer();
for (i=0; i<network->nvertices; i++)
{
// Skip to next "node" entry
do
{
next_line(line);
}
while (strstr(line,"node")==NULL);
// Read in the details of this vertex
// initial value
network->vertex[i].value=0.0;
do
{
// Look for ID
ptr = strstr(line,"id");
if (ptr!=NULL) sscanf(ptr,"id %i",&network->vertex[i].id);
// Look for label
ptr = (strstr(line,"label"));
if (ptr!=NULL)
{
start = strchr(line,'"');
if (start==NULL)
{
sscanf(ptr,"label %s",&label);
}
else
{
stop = strchr(++start,'"');
if (stop==NULL) length = strlen(line) - (start-line);
else length = stop - start;
strncpy(label,start,length);
label[length] = '\0';
network->vertex[i].label = malloc((length+1)*sizeof(char));
strcpy(network->vertex[i].label,label);
}
}
// Look for value
ptr = (strstr(line, "value"));
if (ptr!=NULL)
{
sscanf(ptr, "value %lf", &network->vertex[i].value);
}
// If we see a closing square bracket we are done
if (strstr(line,"]")!=NULL) break;
}
while (next_line(line)==0);
}
// Sort the vertices in increasing order of their IDs so we can find them
// quickly later
qsort(network->vertex,network->nvertices,sizeof(VERTEX),(void*)cmpid);
}
// Function to find a vertex with a specified ID using binary search.
// Returns the element in the vertex[] array holding the vertex in question,
// or -1 if no vertex was found.
int find_vertex(int id, NETWORK *network)
{
int top,bottom,split;
int idsplit;
top = network->nvertices;
if (top<1) return -1;
bottom = 0;
split = top/2;
do
{
idsplit = network->vertex[split].id;
if (id>idsplit)
{
bottom = split + 1;
split = (top+bottom)/2;
}
else if (id<idsplit)
{
top = split;
split = (top+bottom)/2;
}
else return split;
}
while (top>bottom);
return -1;
}
// Function to determine the degrees of all the vertices by going through
// the edge data
void get_degrees(NETWORK *network)
{
int s,t;
int vs,vt;
char *ptr;
char line[LINELENGTH];
reset_buffer();
while (next_line(line)==0)
{
// Find the next edge entry
ptr = strstr(line,"edge");
if (ptr==NULL) continue;
// Read the source and target of the edge
s = t = -1;
do
{
ptr = strstr(line,"source");
if (ptr!=NULL) sscanf(ptr,"source %i",&s);
ptr = strstr(line,"target");
if (ptr!=NULL) sscanf(ptr,"target %i",&t);
// If we see a closing square bracket we are done
if (strstr(line,"]")!=NULL) break;
}
while (next_line(line)==0);
// Increment the degrees of the appropriate vertex or vertices
if ((s>=0)&&(t>=0))
{
vs = find_vertex(s,network);
network->vertex[vs].degree++;
if (network->directed==0)
{
vt = find_vertex(t,network);
network->vertex[vt].degree++;
}
}
}
return;
}
// Function to read in the edges
void read_edges(NETWORK *network)
{
int i;
int s,t;
int vs,vt;
int *count;
double w;
char *ptr;
char line[LINELENGTH];
// Malloc space for the edges and temporary space for the edge counts
// at each vertex
for (i=0; i<network->nvertices; i++)
{
network->vertex[i].edge = malloc(network->vertex[i].degree*sizeof(EDGE));
}
count = calloc(network->nvertices,sizeof(int));
// Read in the data
reset_buffer();
while (next_line(line)==0)
{
// Find the next edge entry
ptr = strstr(line,"edge");
if (ptr==NULL) continue;
// Read the source and target of the edge and the edge weight
s = t = -1;
w = 1.0;
do
{
ptr = strstr(line,"source");
if (ptr!=NULL) sscanf(ptr,"source %i",&s);
ptr = strstr(line,"target");
if (ptr!=NULL) sscanf(ptr,"target %i",&t);
ptr = strstr(line,"value");
if (ptr!=NULL) sscanf(ptr,"value %lf",&w);
// If we see a closing square bracket we are done
if (strstr(line,"]")!=NULL) break;
}
while (next_line(line)==0);
// Add these edges to the appropriate vertices
if ((s>=0)&&(t>=0))
{
vs = find_vertex(s,network);
vt = find_vertex(t,network);
network->vertex[vs].edge[count[vs]].target = vt;
network->vertex[vs].edge[count[vs]].weight = w;
count[vs]++;
if (network->directed==0)
{
network->vertex[vt].edge[count[vt]].target = vs;
network->vertex[vt].edge[count[vt]].weight = w;
count[vt]++;
}
}
}
free(count);
return;
}
// Function to read a complete network
int read_network(NETWORK *network, FILE *stream)
{
fill_buffer(stream);
create_network(network);
get_degrees(network);
read_edges(network);
free_buffer();
return 0;
}
// Function to free the memory used by a network again
void free_network(NETWORK *network)
{
int i;
for (i=0; i<network->nvertices; i++)
{
free(network->vertex[i].edge);
free(network->vertex[i].label);
}
free(network->vertex);
}