[elasticsearch笔记] Analysis - Tokenizer




  • The standard tokenizer provides grammar based tokenization (based on the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29) and works well for most languages.
PUT standard_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "standard",
          "max_token_length": 5

POST standard_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."


  • The letter tokenizer breaks text into terms whenever it encounters a character which is not a letter.
POST _analyze
  "tokenizer": "letter",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."


  • The lowercase tokenizer, like the letter tokenizer, breaks text into terms whenever it encounters a character which is not a letter, but it also lowercases all terms.
POST _analyze
  "tokenizer": "lowercase",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."


  • The whitespace tokenizer breaks text into terms whenever it encounters a whitespace character.
POST _analyze
  "tokenizer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."


  • The uax_url_email tokenizer is like the standard tokenizer except that it recognises URLs and email addresses as single tokens.
POST _analyze
  "tokenizer": "uax_url_email",
  "text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"

DELETE uax_url_email_tokenizer_index
PUT uax_url_email_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "uax_url_email",

POST uax_url_email_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"


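  • The classic tokenizer is a grammar based tokenizer that is good for English language documents. It has heuristics for special treatment of acronyms, company names, email addresses, and internet host names:
  • It splits words at most punctuation characters, removing punctuation. However, a dot that’s not followed by whitespace is considered part of a token.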
  • It splits words at hyphens, unless there’s a number in the token, in which case the whole token is interpreted as a product number and is not split.
  • It recognizes email addresses and internet hostnames as one token.
POST _analyze
  "tokenizer": "classic",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."

PUT classic_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "classic",
          "max_token_length": 5

POST classic_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."


  • The ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word of the specified length.
POST _analyze
  "tokenizer": "ngram",
  "text": "Quick Fox"

PUT ngram_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": [

POST ngram_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "2 Quick Foxes."


  • The edge_ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word where the start of the N-gram is anchored to the beginning of the word.
POST _analyze
  "tokenizer": "edge_ngram",
  "text": "Quick Fox"

PUT edge_ngram_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": [

POST edge_ngram_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "2 Quick Foxes."

PUT autocomplete_index
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
        "autocomplete_search": {
          "tokenizer": "lowercase"
      "tokenizer": {
        "autocomplete": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": [
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search"

PUT autocomplete_index/_doc/1
  "title": "Quick Foxes" 

POST autocomplete_index/_refresh

GET autocomplete_index/_search
  "query": {
    "match": {
      "title": {
        "query": "Quick Fo", 
        "operator": "and"

GET autocomplete_index/_analyze
  "field": "title",
  "text":"Quick Foxes"

GET autocomplete_index/_analyze
  "field": "title",
  "text":"Quick Fo"


POST _analyze
  "tokenizer": "keyword",
  "text": "New York"


POST _analyze
  "tokenizer": "pattern",
  "text": "The foo_bar_size's default is 5."

PUT pattern_tokenizer_index_v1
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": ","

POST pattern_tokenizer_index_v1/_analyze
  "analyzer": "my_analyzer",
  "text": "comma,separated,values"

# In the next example, we configure the pattern tokenizer to capture values enclosed in double quotes (ignoring embedded escaped quotes \"). Before JSON escaping, the regex is "((?:\\"|[^"]|\\")+)" and capture group 1 is emitted as the token.
PUT pattern_tokenizer_index_v2
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"",
          "group": 1

POST pattern_tokenizer_index_v2/_analyze
  "analyzer": "my_analyzer",
  "text": "\"value\", \"value with embedded \\\" quote\""


  • The char_group tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful for cases where a simple custom tokenization is desired, and the overhead of using the pattern tokenizer is not acceptable.
POST _analyze
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
  "text": "The QUICK brown-fox"


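  • The simple_pattern tokenizer (marked experimental in the Elasticsearch documentation) uses a regular expression to capture matching text as terms. It supports a more limited set of regular expression features than the pattern tokenizer, but tokenization is generally faster.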
PUT simple_pattern_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern",
          "pattern": "[0123456789]{3}"

POST simple_pattern_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "fd-786-335-514-12-x"


PUT simple_pattern_split_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "_"

POST simple_pattern_split_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "an_underscored_phrase"

DELETE simple_pattern_split_tokenizer_index_v2
PUT simple_pattern_split_tokenizer_index_v2
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "[0123456789]{3}|-"

POST simple_pattern_split_tokenizer_index_v2/_analyze
  "analyzer": "my_analyzer",
  "text": "fd-786-335-514-12-x"


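# path_hierarchy: takes a hierarchical value such as a filesystem path, splits it on the path separator, and emits a term for each component in the tree.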
POST _analyze
  "tokenizer": "path_hierarchy",
  "text": "/one/two/three"

DELETE path_hierarchy_tokenizer_index
PUT path_hierarchy_tokenizer_index
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
      "tokenizer": {
        "my_tokenizer": {
          "type": "path_hierarchy",
          "delimiter": "-",
          "replacement": "/",
          "skip": 2,

POST path_hierarchy_tokenizer_index/_analyze
  "analyzer": "my_analyzer",
  "text": "one-two-three-four-five"

PUT file-path-test
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_path_tree": {
          "tokenizer": "custom_hierarchy"
        "custom_path_tree_reversed": {
          "tokenizer": "custom_hierarchy_reversed"
      "tokenizer": {
        "custom_hierarchy": {
          "type": "path_hierarchy",
          "delimiter": "/"
        "custom_hierarchy_reversed": {
          "type": "path_hierarchy",
          "delimiter": "/",
          "reverse": "true"
  "mappings": {
    "properties": {
      "file_path": {
        "type": "text",
        "fields": {
          "tree": {
            "type": "text",
            "analyzer": "custom_path_tree"
          "tree_reversed": {
            "type": "text",
            "analyzer": "custom_path_tree_reversed"

POST file-path-test/_bulk
{"file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"}
{"file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"}
{"file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"}
{"file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"}
{"file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"}

GET file-path-test/_search
  "query": {
    "match": {
      "file_path": "/User/bob/photos/2017/05"

GET file-path-test/_search
  "query": {
    "term": {
      "file_path.tree": "/User/alice/photos/2017/05/16"

GET file-path-test/_search
  "query": {
    "term": {
      "file_path.tree_reversed": {
        "value": "my_photo1.jpg"

POST file-path-test/_analyze
  "analyzer": "custom_path_tree",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"

POST file-path-test/_analyze
  "analyzer": "custom_path_tree_reversed",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"

GET file-path-test/_search
  "query": {
    "bool" : {
      "must" : {
        "match" : { "file_path" : "16" }
      "filter": {
        "term" : { "file_path.tree" : "/User/alice" }