我想从自己的语料库中提取短语。为此,我定义了两条规则:第一条匹配“名词后跟一个或多个名词”,第二条匹配“形容词后接名词”。我希望当两条规则提取出相同的短语时,程序忽略第二条规则的结果。我遇到的问题是:短语只会按第一条规则被提取,第二条规则似乎完全没有生效。
# Chunk grammar for nltk.RegexpParser. Both rules belong to the NP stage and
# are applied in order:
#   1. a noun followed by one or more nouns
#   2. an adjective followed by nouns
# NOTE(review): the <TAG> tokens were missing from the pasted grammar (it read
# "{+}" and "{*}"); <NN> and <ADJ> are reconstructed from the rule
# descriptions -- confirm the tag names against the corpus tagset.
PATTERN = r"""
NP: {<NN><NN>+}
{<ADJ><NN>*}
"""
MIN_FREQ = 1
MIN_CVAL = -13  # lowest cval -13
def __init__(self, corpus_root='../multiwords/test'):
    """Open the tagged corpus and reset the cached statistics.

    Args:
        corpus_root: Directory that holds the tagged corpus files. A
            relative path is resolved against the current working
            directory. Defaults to '../multiwords/test', the location that
            used to be hard-coded here.
    """
    corpus_root = os.path.abspath(corpus_root)
    # The fileids pattern '.*' matches every file below corpus_root.
    self.corpus = nltk.corpus.reader.TaggedCorpusReader(corpus_root, '.*')
    # Caches start empty; nothing in this method populates them.
    self.word_count_by_document = None
    self.phrase_frequencies = None
def calculate_phrase_frequencies(self):
    """Count how often each NP chunk occurs in the tagged corpus.

    Every tagged sentence of ``self.corpus`` is parsed with an
    ``nltk.RegexpParser`` built from ``self.PATTERN``. For each subtree
    labelled ``NP``, the words (without their POS tags) are joined with
    single spaces and that phrase's counter is incremented.

    The table is stored in ``self.phrase_frequencies`` as a
    ``defaultdict(int)`` mapping phrase text to its number of occurrences.
    Nothing is returned.
    """
    chunk_freq_dict = defaultdict(int)
    # Build the parser once and reuse it for every sentence.
    chunker = nltk.RegexpParser(self.PATTERN)
    for sent in self.corpus.tagged_sents():
        # Tokens whose tag is None are dropped before chunking.
        tagged = [tok for tok in sent if tok[1] is not None]
        for chk in chunker.parse(tagged).subtrees():
            # Compare the label exactly instead of testing the string form
            # of the tree, so labels that merely start with "NP" are not
            # counted.
            if chk.label() != 'NP':
                continue
            # Read the words from the (word, tag) leaves directly; this
            # avoids the Python-2-only Tree.__unicode__() and re-parsing
            # the "word/tag" text.
            just_phrase = ' '.join(word for word, _tag in chk.leaves())
            chunk_freq_dict[just_phrase] += 1
    self.phrase_frequencies = chunk_freq_dict