Ragel是一个状态机编译器,类似Lex,主要用来处理字符输入、进行语法解析。简单的文本处理工作一般用正则表达式,或者用awk/sed这些工具就可以完成;而当你的代码的核心任务就是解析文本、并且需要高效地处理数据时(比如实现一个SMTP引擎或HTTP引擎),就适合使用Ragel:它可以按你定义好的语法生成一个状态机,嵌入到你的代码中。因为这个状态机是专门针对你预定义的语法生成的,且以你的原生代码执行,效率自然比正则表达式、awk这些通用工具高得多(据说媲美汇编)。Ragel支持生成C/C++/Java/Ruby/D/C#等多种语言的代码。
大家可以去官网自行下载最新的Ragel源代码,官网地址 点击这里
这里也给出一份已经下载好的Ragel源代码,大家也可以在这里下载
ragel-6.10
colm-0.13.0.7
ragel官方使用指南pdf
安装使用Ragel也需要colm,所以上面的请全部下载下来
输入 tar xvf colm-0.13.0.7.tar.gz,解压colm
输入 cd colm-0.13.0.7,进入源码目录
输入 yum install libtool gcc gcc-c++ autoconf automake,安装编译所需组件(yum源里C++编译器的包名是gcc-c++,而不是g++)
输入 ./configure,执行配置操作
输入 make,编译源代码
输入 make install,将colm安装到系统上
输入 tar xvf ragel-6.10.tar.gz,解压ragel
输入 cd ragel-6.10,进入源码目录
输入 ./configure,执行配置操作
输入 make,编译源代码
输入 make install,将Ragel安装到系统上
#include <iostream>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>
// Ragel specification of the `parser` machine: an optional "+" or "-" sign
// followed by one or more decimal digits. The two actions below write to the
// host variables `is_negative` and `number`, so variables with exactly those
// names must be in scope wherever `write exec` is expanded.
// `write data` emits the machine's static tables and constants at this point.
%%{
machine parser;
action save_symbol
{
// Attached with `%` to the "-" alternative in `main`; inspects the character
// just before fpc and records a negative sign.
if (*(fpc - 1) == '-')
is_negative = true;
}
action save_number
{
// Attached with `%` to every digit in `main`; appends the digit located just
// before fpc to the accumulated value.
number = number * 10 + *(fpc - 1) - '0';
}
main := ("+" | "-" %save_symbol)? (digit %save_number)+;
write data;
}%%
/**
 * Returns the current wall-clock time in milliseconds since the Unix epoch,
 * as reported by gettimeofday().
 *
 * The seconds field is widened to uint64_t before it is multiplied: with a
 * 32-bit `unsigned long` (and a 32-bit time_t) the former
 * `t.tv_sec * 1000ul` wrapped around and produced a wrong timestamp.
 *
 * @return milliseconds since the epoch.
 */
uint64_t getCurrentMS()
{
    struct timeval t;
    gettimeofday(&t, NULL);

    const uint64_t seconds_as_ms = static_cast<uint64_t>(t.tv_sec) * 1000u;
    const uint64_t micros_as_ms = static_cast<uint64_t>(t.tv_usec) / 1000u;
    return seconds_as_ms + micros_as_ms;
}
void test_myatoi(int argc, char* argv[])
{
if (argc < 2)
{
std::cout << "Please input integer number" << std::endl;
return;
}
bool is_negative = false;
int64_t number = 0;
int cs;
%% write init;
const char *p = argv[1];
const char *pe = p + strlen(p);
const char* eof = pe;
%% write exec;
if (cs == parser_error)
{
std::cout << "Parse fail, please input integer number" << std::endl;
}
else
{
if (is_negative)
number = -number;
std::cout << "Parse successful, input number=" << number << std::endl;
}
}
void test_efficiency()
{
const char* str = "865466464";
int64_t number;
uint64_t start_time = getCurrentMS();
for (size_t i = 0; i < 1000000000; i++)
{
number = atol(str);
}
std::cout << "execute system atoi need " << getCurrentMS() - start_time << "ms" << std::endl;
start_time = getCurrentMS();
for (size_t i = 0; i < 1000000000; i++)
{
bool is_negative = false;
int64_t number = 0;
int cs;
%% write init;
const char *p = str;
const char *pe = p + strlen(p);
const char* eof = pe;
%% write exec;
if (cs == parser_error)
{
std::cout << "Parse fail" << std::endl;
}
}
std::cout << "execute self atoi need " << getCurrentMS() - start_time << "ms" << std::endl;
}
/**
 * Program entry point: runs the command-line parsing demo on the given
 * arguments, then the atol-versus-Ragel timing comparison.
 */
int main(int argc, char* argv[])
{
    test_myatoi(argc, argv);  // parse argv[1] and print the outcome
    test_efficiency();        // print the timings of both conversion loops

    return EXIT_SUCCESS;
}
这里先直接贴出测试代码,在文章最后也会给出一份测试文件的,以下是代码关键部分的解析
第7行:%%{ 表示Ragel代码块开始
第8行:machine定义一个状态机
第10、15行:各定义一个action(动作),状态机解析到指定位置时会调用对应的动作
第20行:main := ("+" | "-" %save_symbol)? (digit %save_number)+ 定义状态机要匹配的正则表达式,main是个关键字,表示状态机入口
第21行:write data 指示Ragel在代码的这个位置写入ragel运行需要的静态数据
第22行:}%% 表示Ragel代码块结束
第76行:write init 指示Ragel在代码的这个位置写入Ragel运行需要的初始化代码
第77、78行:变量p和pe表示状态机处理的buffer起始和终止地址,状态机运行时就从这个buffer依次读入字符,变量名很重要,必须为p和pe,因为Ragel生成的代码里面使用这两个变量名
第80行:write exec 指示Ragel在代码的这个位置写入运行状态机的代码
接下来对最重要的正则语句进行解释
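main := ("+" | "-" %save_symbol)? (digit %save_number)+
其中 ("+" | "-" %save_symbol)? 表示一个可选的正负号,(digit %save_number)+ 表示一个或多个数字。% 是"离开动作"(leaving action)操作符:当状态机离开它所修饰的子表达式时(读到下一个字符,或者到达输入末尾eof)才执行对应的action。此时fpc已经指向下一个字符,所以action里要用 *(fpc - 1) 来取刚刚匹配到的那个字符。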
编译运行之后可以看到Ragel编译出来的atoi比系统的atoi函数要快3到5倍,效率还是很可观的
官方uri说明文档 点击这里
   foo://user@example.com:8042/over/there?name=ferret#nose
   \_/   \___________________/\_________/ \_________/ \__/
    |              |           |               |        |
 scheme        authority      path           query   fragment
    |   _______________________|
   / \ /                        \
   urn:example:animal:ferret:nose
最后解析也是需要对uri结构的各个部分进行分块解析的
如下所示,各个action用于存储uri各个部分的信息
action save_scheme
{
// Hand the text in [mark, fpc) to the uri object as its scheme, then reset mark.
uri->setScheme(std::string(mark, fpc - mark));
mark = NULL;
}
action save_userinfo
{
// Hand the text in [mark, fpc) to the uri object as its userinfo, then reset mark.
uri->setUserinfo(std::string(mark, fpc - mark));
mark = NULL;
}
action save_host
{
// Hand the text in [mark, fpc) to the uri object as its host, then reset mark.
uri->setHost(std::string(mark, fpc - mark));
mark = NULL;
}
action save_path
{
// Hand the text in [mark, fpc) to the uri object as its path, then reset mark.
uri->setPath(std::string(mark, fpc - mark));
mark = NULL;
}
action save_port
{
// The port is only set when at least one character lies between mark and fpc.
// NOTE(review): atoi(mark) stops at the first non-digit rather than at fpc;
// presumably fine because the port is followed by a delimiter or the
// terminating NUL - confirm the parsed buffer is NUL-terminated.
if (fpc != mark)
{
uri->setPort(atoi(mark));
}
mark = NULL;
}
action save_query
{
// Hand the text in [mark, fpc) to the uri object as its query, then reset mark.
uri->setQuery(std::string(mark, fpc - mark));
mark = NULL;
}
action save_fragment
{
// Hand the text in [mark, fpc) to the uri object as its fragment, then reset mark.
uri->setFragment(std::string(mark, fpc - mark));
mark = NULL;
}
由于官方已经给出了uri的正则解析表达式,我们就没必要自己去设计了。不过官方给出的表达式写法和Ragel的语法有部分区别,接下来对不一样的那部分进行分析,其余大部分内容还是需要大家自行去看说明文档
有了以上的铺垫,我们就可以比较轻松地将官方的正则移植为Ragel格式的代码了,如下所示
%%{
    # URI / relative-reference grammar transcribed from the ABNF of RFC 3986.
    # See RFC 3986: http://www.ietf.org/rfc/rfc3986.txt
    #
    # Host-code contract, as used by the actions below:
    #   mark - character pointer; set to the start of the component being
    #          matched and reset to NULL once the component has been saved.
    #   uri  - object accessed through `->`, providing setScheme / setUserinfo /
    #          setHost / setPath / setPort / setQuery / setFragment.
    #
    # NOTE(review): every rule shares the single `mark` pointer. Where two rules
    # can match the same input in parallel (e.g. userinfo and host before an
    # "@"), one rule's save action may reset `mark` while the other still needs
    # it - e.g. a userinfo containing ":" may reach save_userinfo with
    # mark == NULL. Verify before feeding untrusted input.
    machine uri_parser;

    gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";
    sub_delims = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
    reserved = gen_delims | sub_delims;
    unreserved = alpha | digit | "-" | "." | "_" | "~";
    pct_encoded = "%" xdigit xdigit;

    # Remember where the current component starts.
    action marku { mark = fpc; }

    action save_scheme
    {
        // Hand the text in [mark, fpc) to the uri object as its scheme, then reset mark.
        uri->setScheme(std::string(mark, fpc - mark));
        mark = NULL;
    }
    action save_userinfo
    {
        // Hand the text in [mark, fpc) to the uri object as its userinfo, then reset mark.
        uri->setUserinfo(std::string(mark, fpc - mark));
        mark = NULL;
    }
    action save_host
    {
        // Hand the text in [mark, fpc) to the uri object as its host, then reset mark.
        uri->setHost(std::string(mark, fpc - mark));
        mark = NULL;
    }
    action save_path
    {
        // Hand the text in [mark, fpc) to the uri object as its path, then reset mark.
        uri->setPath(std::string(mark, fpc - mark));
        mark = NULL;
    }
    action save_port
    {
        // The port is only set when at least one character lies between mark and fpc.
        // NOTE(review): atoi(mark) stops at the first non-digit rather than at
        // fpc; presumably fine because the port is followed by a delimiter or
        // the terminating NUL - confirm the parsed buffer is NUL-terminated.
        if (fpc != mark)
        {
            uri->setPort(atoi(mark));
        }
        mark = NULL;
    }
    action save_query
    {
        // Hand the text in [mark, fpc) to the uri object as its query, then reset mark.
        uri->setQuery(std::string(mark, fpc - mark));
        mark = NULL;
    }
    action save_fragment
    {
        // Hand the text in [mark, fpc) to the uri object as its fragment, then reset mark.
        uri->setFragment(std::string(mark, fpc - mark));
        mark = NULL;
    }

    scheme = (alpha (alpha | digit | "+" | "-" | ".")*) >marku %save_scheme;
    userinfo = (unreserved | pct_encoded | sub_delims | ":")*;

    # "2" must be a string literal: a bare 2 is an integer literal in Ragel and
    # matches the byte with value 2, so octets 200-249 never matched.
    dec_octet = digit | [1-9] digit | "1" digit{2} | "2" [0-4] digit | "25" [0-5];
    IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;
    h16 = xdigit{1,4};
    ls32 = (h16 ":" h16) | IPv4address;

    # RFC 3986 writes the optional prefix as  [ *N( h16 ":" ) h16 ] , where *N
    # means "at most N repetitions" - hence {,N} here. An exact {N} rejects
    # common compressed addresses such as "fe80::1".
    IPv6address = (                            (h16 ":"){6} ls32) |
                  (                       "::" (h16 ":"){5} ls32) |
                  ((                h16)? "::" (h16 ":"){4} ls32) |
                  (((h16 ":"){,1} h16)? "::" (h16 ":"){3} ls32) |
                  (((h16 ":"){,2} h16)? "::" (h16 ":"){2} ls32) |
                  (((h16 ":"){,3} h16)? "::" (h16 ":"){1} ls32) |
                  (((h16 ":"){,4} h16)? "::"              ls32) |
                  (((h16 ":"){,5} h16)? "::"              h16 ) |
                  (((h16 ":"){,6} h16)? "::"                   );
    IPvFuture = "v" xdigit+ "." (unreserved | sub_delims | ":")+;
    IP_literal = "[" (IPv6address | IPvFuture) "]";
    reg_name = (unreserved | pct_encoded | sub_delims)*;
    host = IP_literal | IPv4address | reg_name;
    port = digit*;

    # RFC definition: pchar = unreserved | pct_encoded | sub_delims | ":" | "@";
    # (any -- ascii) additionally admits non-ASCII bytes, e.g. Chinese text.
    pchar = ( (any -- ascii ) | unreserved | pct_encoded | sub_delims | ":" | "@" );
    segment = pchar*;
    segment_nz = pchar+;
    segment_nz_nc = (pchar - ":")+;

    # path = path-abempty  ; begins with "/" or is empty
    #      / path-absolute ; begins with "/" but not "//"
    #      / path-noscheme ; begins with a non-colon segment
    #      / path-rootless ; begins with a segment
    #      / path-empty    ; zero characters
    #
    # Action embedding binds tighter than concatenation, so each path form is
    # parenthesised as a whole before >marku %save_path is attached. Without
    # the parentheses the actions applied only to the trailing ("/" segment)*
    # and the first segment was missing from the saved path.
    path_abempty = ("/" segment)* >marku %save_path;
    path_absolute = ("/" (segment_nz ("/" segment)*)?) >marku %save_path;
    path_noscheme = (segment_nz_nc ("/" segment)*) >marku %save_path;
    path_rootless = (segment_nz ("/" segment)*) >marku %save_path;
    path_empty = "" >marku %save_path;

    authority = (userinfo >marku %save_userinfo "@")? host >marku %save_host (":" port >marku %save_port)?;
    query = (pchar | "/" | "?")* >marku %save_query;
    fragment = (pchar | "/" | "?")* >marku %save_fragment;
    hier_part = ("//" authority path_abempty) | path_absolute | path_rootless | path_empty;
    relative_part = ("//" authority path_abempty) | path_absolute | path_noscheme | path_empty;
    relative_ref = relative_part ("?" query)? ("#" fragment)?;
    URI = scheme ":" hier_part ("?" query)? ("#" fragment)?;
    URI_reference = URI | relative_ref;

    main := URI_reference;
    write data;
}%%
Ragel从测试结果来看效率确实是很高的,而且编码规则也和其他语言的正则非常相似,加上官方文档给出的示例和配图非常多,学起来难度不是很大,有兴趣的可以去看看官方手册
ragel测试程序.zip