boost::tokenizer分词器

庞修贤
2023-12-01

tokenizer - Breaks a string or other character sequence into a series of tokens, by John Bandela
tokenizer - 分解字串,提取内容.作者: John Bandela

例一:

// simple_example_1.cpp
#include<iostream>
#include<boost/tokenizer.hpp>
#include<string>
// Example 1: split a sentence with the default tokenizer<> and print one
// token per line. By default the tokenizer treats whitespace and punctuation
// as word boundaries, so "This is,  a test" yields This / is / a / test.
int main(){
    using namespace std;
    using namespace boost;
    string s = "This is,  a test";
    tokenizer<> tok(s);
    for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end();++beg){
       // "\n" is the newline escape; the former "/n" printed a literal
       // slash followed by 'n' and kept all tokens on one line.
       cout << *beg << "\n";
   }
}

输出
This
is
a
test

tokenizer默认将单词以空格和标点为边界分开.

例二:

#include<iostream>
#include<boost/tokenizer.hpp>
#include<string>
// Example 2: parse a comma-separated line with escaped_list_separator.
// A field wrapped in double quotes may itself contain commas, so the input
// below produces exactly three tokens, printed one per line.
int main(){
    using namespace std;
    using namespace boost;
    // The embedded double quotes must be escaped with a backslash (\");
    // the former /" terminated the string literal early and did not compile.
    string s = "Field 1,\"putting quotes around fields, allows commas\",Field 3";
    tokenizer<escaped_list_separator<char> > tok(s);
    for(tokenizer<escaped_list_separator<char> >::iterator beg=tok.begin(); beg!=tok.end();++beg){
       // "\n" is the newline escape; "/n" would print a literal slash and 'n'.
       cout << *beg << "\n";
   }
}

输出
Field 1
putting quotes around fields, allows commas
Field 3

escaped_list_separator按逗号分隔字段;用双引号括起来的字段内部可以包含逗号等标点,不会被拆开.


例三:

// simple_example_3.cpp
#include<iostream>
#include<boost/tokenizer.hpp>
#include<string>
// Example 3: cut a string into fixed-width pieces with offset_separator.
// The offsets {2,2,4} split "12252001" into "12", "25" and "2001",
// printed one per line.
int main(){
    using namespace std;
    using namespace boost;
    string s = "12252001";
    int offsets[] = {2,2,4};
    // The separator takes the [begin, end) range of the widths array.
    offset_separator f(offsets, offsets+3);
    tokenizer<offset_separator> tok(s,f);
    for(tokenizer<offset_separator>::iterator beg=tok.begin(); beg!=tok.end();++beg){
       // "\n" is the newline escape; "/n" would print a literal slash and 'n'.
       cout << *beg << "\n";
   }
}

把12252001分解为
12
25
2001

例4:

// char_sep_example_1.cpp
#include <iostream>
#include <boost/tokenizer.hpp>
#include <string>
// Example 4: tokenize with a custom set of delimiter characters.
// Every '-', ';' and '|' acts as a delimiter and is dropped; each remaining
// token is printed wrapped in angle brackets on a single line.
int main()
{
    std::string str = ";!!;Hello|world||-foo--bar;yow;baz|";
    typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
    boost::char_separator<char> sep("-;|");
    tokenizer tokens(str, sep);
    // Braces make it explicit that only the token print belongs to the loop.
    for (tokenizer::iterator tok_iter = tokens.begin();
         tok_iter != tokens.end(); ++tok_iter) {
        std::cout << "<" << *tok_iter << "> ";
    }
    // Terminate the output line; the former "/n" printed a literal slash and 'n'.
    std::cout << "\n";
    return EXIT_SUCCESS;
}

输出
<!!> <Hello> <world> <foo> <bar> <yow> <baz>
自定义分隔的标点

例5:
   

 // char_sep_example_2.cpp
    #include <iostream>
    #include <boost/tokenizer.hpp>
    #include <string>
    // Example 5: char_separator with dropped delimiters, kept delimiters and
    // empty tokens enabled. '-' and ';' are passed as the first (dropped)
    // delimiter set, '|' as the second (kept) set, and keep_empty_tokens
    // is requested. Each token is printed wrapped in angle brackets.
    int main()
    {
        std::string str = ";;Hello|world||-foo--bar;yow;baz|";
        typedef boost::tokenizer<boost::char_separator<char> >
            tokenizer;
        boost::char_separator<char> sep("-;", "|", boost::keep_empty_tokens);
        tokenizer tokens(str, sep);
        // Braces make it explicit that only the token print belongs to the loop.
        for (tokenizer::iterator tok_iter = tokens.begin();
             tok_iter != tokens.end(); ++tok_iter) {
            std::cout << "<" << *tok_iter << "> ";
        }
        // Terminate the output line; the former "/n" printed a literal slash and 'n'.
        std::cout << "\n";
        return EXIT_SUCCESS;
    }

The output is:

    <> <> <Hello> <|> <world> <|> <> <|> <> <foo> <> <bar> <yow> <baz> <|> <>
'-'和';'作为分隔符被丢弃;'|'也是分隔符,但会被保留为单独的token.由于指定了keep_empty_tokens,当两个分隔符相邻(或分隔符位于字符串首尾)时,会输出一个空token(即<>),而不是空格.

例6:
    

// char_sep_example_3.cpp
    #include <iostream>
    #include <boost/tokenizer.hpp>
    #include <string>
    // Example 6: tokenize with a default-constructed char_separator.
    // For "This is,  a test" the comma is kept as its own token while the
    // spaces are dropped; each token is printed wrapped in angle brackets.
    int main()
    {
       std::string str = "This is,  a test";
       typedef boost::tokenizer<boost::char_separator<char> > Tok;
       boost::char_separator<char> sep; // default constructed
       Tok tok(str, sep);
       // Braces make it explicit that only the token print belongs to the loop.
       for(Tok::iterator tok_iter = tok.begin(); tok_iter != tok.end(); ++tok_iter) {
          std::cout << "<" << *tok_iter << "> ";
       }
       // Terminate the output line; the former "/n" printed a literal slash and 'n'.
       std::cout << "\n";
       return EXIT_SUCCESS;
    }

The output is:

    <This> <is> <,> <a> <test>
默认构造的char_separator以空白字符作为分隔符并将其丢弃;标点同样被看作分隔符,但会保留为单独的token.

 类似资料: