[Elasticsearch Notes] Analysis - Tokenizer

周通
2023-12-01

demo

standard

  • The standard tokenizer provides grammar based tokenization (based on the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29) and works well for most languages.
PUT standard_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "standard",
          "max_token_length": 5
        }
      }
    }
  }
}

POST standard_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
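
# With max_token_length set to 5, terms longer than five characters are split at the
# limit, so the expected terms are
# [ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ].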

letter

  • The letter tokenizer breaks text into terms whenever it encounters a character which is not a letter.
POST _analyze
{
  "tokenizer": "letter",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
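
# Splitting on every non-letter drops the "2" and breaks "dog's" at the apostrophe:
# [ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ].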

lowercase

  • The lowercase tokenizer, like the letter tokenizer, breaks text into terms whenever it encounters a character which is not a letter, but it also lowercases all terms.
POST _analyze
{
  "tokenizer": "lowercase",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
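
# Same splits as the letter tokenizer, but lowercased:
# [ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ].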

whitespace

  • The whitespace tokenizer breaks text into terms whenever it encounters a whitespace character.
POST _analyze
{
  "tokenizer": "whitespace",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
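
# Only whitespace separates terms, so hyphens and punctuation are kept:
# [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ].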

uax_url_email

  • The uax_url_email tokenizer is like the standard tokenizer except that it recognises URLs and email addresses as single tokens.
POST _analyze
{
  "tokenizer": "uax_url_email",
  "text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"
}
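
# For comparison, running the same text through the built-in standard tokenizer
# should split the email address at the "@" sign instead of keeping
# "zhengcj01@test.com" as a single token.
POST _analyze
{
  "tokenizer": "standard",
  "text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"
}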

DELETE uax_url_email_tokenizer_index
PUT uax_url_email_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "uax_url_email",
          "max_token_length":50
        }
      }
    }
  }
}

POST uax_url_email_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "my home page is www.zhengcj01.com and the email is zhengcj01@test.com"
}

classic

  • The classic tokenizer is a grammar based tokenizer that is good for English language documents.
  • It splits words at most punctuation characters, removing punctuation. However, a dot that’s not followed by whitespace is considered part of a token.
  • It splits words at hyphens, unless there’s a number in the token, in which case the whole token is interpreted as a product number and is not split.
  • It recognizes email addresses and internet hostnames as one token.
POST _analyze
{
  "tokenizer": "classic",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
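
# The classic tokenizer splits "Brown-Foxes" at the hyphen (no digits in the token)
# and strips the trailing period:
# [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ].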

PUT classic_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "classic",
          "max_token_length": 5
        }
      }
    }
  }
}

POST classic_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}

ngram

  • The ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word of the specified length.
POST _analyze
{
  "tokenizer": "ngram",
  "text": "Quick Fox"
}
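
# With the defaults (min_gram 1, max_gram 2, no token_chars restriction), the
# reference docs show this request producing
# [ Q, Qu, u, ui, i, ic, c, ck, k, "k ", " ", " F", F, Fo, o, ox, x ]
# — note that whitespace is kept inside the grams.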

PUT ngram_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 3,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  }
}

POST ngram_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "2 Quick Foxes."
}
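
# With 3-character grams restricted to letters and digits, the expected terms are
# [ Qui, uic, ick, Fox, oxe, xes ] (grams never cross the space or the period, and
# the single digit "2" is shorter than min_gram).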

edge_ngram

  • The edge_ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word where the start of the N-gram is anchored to the beginning of the word.
POST _analyze
{
  "tokenizer": "edge_ngram",
  "text": "Quick Fox"
}
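
# By default edge_ngram emits grams of length 1-2 anchored at the start of the
# input, so this returns just [ Q, Qu ].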

PUT edge_ngram_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  }
}

POST edge_ngram_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "2 Quick Foxes."
}
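
# With min_gram 2, max_gram 10 and letter/digit token_chars, the request above
# should return edge n-grams anchored to the start of each word, roughly
# [ Qu, Qui, Quic, Quick, Fo, Fox, Foxe, Foxes ] (the lone "2" is shorter than
# min_gram, so it produces nothing).

A common use of edge_ngram is search-as-you-type: index a field with an edge_ngram analyzer but search it with a plain analyzer, so that a partial query such as "Quick Fo" only needs to match the prefix grams stored at index time. The example below sets this up for the title field.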

PUT autocomplete_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "autocomplete": {
          "tokenizer": "autocomplete",
          "filter": [
            "lowercase"
          ]
        },
        "autocomplete_search": {
          "tokenizer": "lowercase"
        }
      },
      "tokenizer": {
        "autocomplete": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10,
          "token_chars": [
            "letter"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search"
      }
    }
  }
}

PUT autocomplete_index/_doc/1
{
  "title": "Quick Foxes" 
}

POST autocomplete_index/_refresh

GET autocomplete_index/_search
{
  "query": {
    "match": {
      "title": {
        "query": "Quick Fo", 
        "operator": "and"
      }
    }
  }
}

GET autocomplete_index/_analyze
{
  "field": "title",
  "text":"Quick Foxes"
}

GET autocomplete_index/_analyze
{
  "field": "title",
  "text":"Quick Fo"
}
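
The two _analyze requests above show why the match works: at index time the title analyzer turns "Quick Foxes" into lowercased prefix grams such as qu, qui, quic, quick, fo, fox, foxe, foxes, while the search analyzer leaves the query "Quick Fo" as just quick and fo. Both query terms exist in the index, so the match query with "operator": "and" finds document 1.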

keyword

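  • The keyword tokenizer is a "noop" tokenizer that accepts whatever text it is given and outputs the exact same text as a single term. It is usually combined with token filters such as lowercase to normalise a value without splitting it (see the sketch after the example below).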
POST _analyze
{
  "tokenizer": "keyword",
  "text": "New York"
}
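
# On its own the keyword tokenizer simply echoes the input as one term; it becomes
# useful when combined with token filters. A minimal sketch, following the pattern
# in the reference docs, that lowercases an email address without splitting it:
POST _analyze
{
  "tokenizer": "keyword",
  "filter": [ "lowercase" ],
  "text": "john.SMITH@example.COM"
}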

pattern

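  • The pattern tokenizer uses a regular expression to either split text into terms whenever it matches a word separator, or to capture matching text as terms. The default pattern is \W+, which splits text whenever it encounters non-word characters.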
POST _analyze
{
  "tokenizer": "pattern",
  "text": "The foo_bar_size's default is 5."
}
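
# The default \W+ pattern splits on runs of non-word characters, so the apostrophe
# and the final period act as separators: [ The, foo_bar_size, s, default, is, 5 ].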

PUT pattern_tokenizer_index_v1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": ","
        }
      }
    }
  }
}

POST pattern_tokenizer_index_v1/_analyze
{
  "analyzer": "my_analyzer",
  "text": "comma,separated,values"
}

# "((?:\\"|[^"]|\\")*)": In the next example, we configure the pattern tokenizer to capture values enclosed in double quotes (ignoring embedded escaped quotes \").
PUT pattern_tokenizer_index_v2
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"",
          "group": 1
        }
      }
    }
  }
}

POST pattern_tokenizer_index_v2/_analyze
{
  "analyzer": "my_analyzer",
  "text": "\"value\", \"value with embedded \\\" quote\""
}
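
# Capture group 1 keeps only the quoted contents, so the expected terms are
# [ value, value with embedded \" quote ].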

char_group

  • The char_group tokenizer breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful for cases where a simple custom tokenization is desired, and the overhead of use of the pattern tokenizer is not acceptable.
POST _analyze
{
  "tokenizer": {
    "type": "char_group",
    "tokenize_on_chars": [
      "whitespace",
      "-",
      "\n"
    ]
  },
  "text": "The QUICK brown-fox"
}
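
# Splitting on whitespace, "-" and "\n" should yield [ The, QUICK, brown, fox ].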

simple_pattern

  • The simple_pattern tokenizer uses a regular expression to capture matching text as terms. It supports a more limited set of regular expression features than the pattern tokenizer, but the tokenization is generally faster. It is flagged as experimental and may change in a future release.
PUT simple_pattern_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern",
          "pattern": "[0123456789]{3}"
        }
      }
    }
  }
}

POST simple_pattern_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "fd-786-335-514-12-x"
}
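
# Only the three-digit runs are captured as terms: [ 786, 335, 514 ]
# ("12" is too short to match, and the rest of the text is ignored).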

simple_pattern_split

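  • The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches. Like simple_pattern, it supports only a restricted subset of regular expression features, and the tokenization is generally faster than with the pattern tokenizer.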
PUT simple_pattern_split_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "_"
        }
      }
    }
  }
}

POST simple_pattern_split_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "an_underscored_phrase"
}
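
# Splitting on "_" gives [ an, underscored, phrase ].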

DELETE simple_pattern_split_tokenizer_index_v2
PUT simple_pattern_split_tokenizer_index_v2
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "[0123456789]{3}|-"
        }
      }
    }
  }
}

POST simple_pattern_split_tokenizer_index_v2/_analyze
{
  "analyzer": "my_analyzer",
  "text": "fd-786-335-514-12-x"
}

path_hierarchy

  • The path_hierarchy tokenizer takes a hierarchical value like a filesystem path, splits on the path separator, and emits a term for each component in the tree.
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/one/two/three"
}
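
# Each level of the path becomes a term: [ /one, /one/two, /one/two/three ].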

DELETE path_hierarchy_tokenizer_index
PUT path_hierarchy_tokenizer_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "path_hierarchy",
          "delimiter": "-",
          "replacement": "/",
          "skip": 2,
          "reverse":true
        }
      }
    }
  }
}

POST path_hierarchy_tokenizer_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "one-two-three-four-five"
}
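
# With these delimiter/replacement/skip settings and reverse left at its default,
# the reference docs show [ /three, /three/four, /three/four/five ]; setting
# reverse to true instead anchors the emitted tokens at the end of the value
# rather than at the beginning.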

PUT file-path-test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_path_tree": {
          "tokenizer": "custom_hierarchy"
        },
        "custom_path_tree_reversed": {
          "tokenizer": "custom_hierarchy_reversed"
        }
      },
      "tokenizer": {
        "custom_hierarchy": {
          "type": "path_hierarchy",
          "delimiter": "/"
        },
        "custom_hierarchy_reversed": {
          "type": "path_hierarchy",
          "delimiter": "/",
          "reverse": "true"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "file_path": {
        "type": "text",
        "fields": {
          "tree": {
            "type": "text",
            "analyzer": "custom_path_tree"
          },
          "tree_reversed": {
            "type": "text",
            "analyzer": "custom_path_tree_reversed"
          }
        }
      }
    }
  }
}

POST file-path-test/_bulk
{"index":{"_id":1}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo1.jpg"}
{"index":{"_id":2}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo2.jpg"}
{"index":{"_id":3}}
{"file_path": "/User/alice/photos/2017/05/16/my_photo3.jpg"}
{"index":{"_id":4}}
{"file_path": "/User/alice/photos/2017/05/15/my_photo1.jpg"}
{"index":{"_id":5}}
{"file_path": "/User/bob/photos/2017/05/16/my_photo1.jpg"}

GET file-path-test/_search
{
  "query": {
    "match": {
      "file_path": "/User/bob/photos/2017/05"
    }
  }
}

GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree": "/User/alice/photos/2017/05/16"
    }
  }
}

GET file-path-test/_search
{
  "query": {
    "term": {
      "file_path.tree_reversed": {
        "value": "my_photo1.jpg"
      }
    }
  }
}

POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}

POST file-path-test/_analyze
{
  "analyzer": "custom_path_tree_reversed",
  "text": "/User/alice/photos/2017/05/16/my_photo1.jpg"
}
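
The last query combines the two views of the path: a full-text match on "16" anywhere in file_path, filtered with a term query on file_path.tree so that only documents whose path lies under /User/alice are returned.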

GET file-path-test/_search
{
  "query": {
    "bool" : {
      "must" : {
        "match" : { "file_path" : "16" }
      },
      "filter": {
        "term" : { "file_path.tree" : "/User/alice" }
      }
    }
  }
}