
How can different chunkers be used back to back in NLTK? I'm using "Python 3 Text Processing with NLTK 3 Cookbook" to parse text. I created the two chunkers "LocationChunker" and "PersonChunker", and they work well.

I've looked everywhere, but how can you use both of them at the same time to analyze a sentence? After that, I'd also like to use the ne_chunk function.

Declaring a backoff with POS taggers is extremely easy, but how do you do this with ChunkParserI?
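For example, with taggers the backoff chain is just this (a minimal sketch; train_sents stands for any POS-tagged training data):

from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger

# assumes the treebank corpus has been fetched with nltk.download('treebank')
train_sents = treebank.tagged_sents()[:3000]

default = DefaultTagger('NN')                          # last resort: tag everything 'NN'
unigram = UnigramTagger(train_sents, backoff=default)  # unseen words fall back to default
bigram = BigramTagger(train_sents, backoff=unigram)    # unseen contexts fall back to unigram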

Many thanks.


http://www.nltk.org/howto/chunk.html? – alvas


I've already read that page, and it doesn't contain a solution to my question. –

Answer


The following code is a simple Gazetteer I made based on the examples you mention.

# -*- coding: utf-8 -*-
import codecs
import os
import re

from nltk.chunk.util import conlltags2tree
from nltk.chunk import ChunkParserI
from nltk.tag import pos_tag
from nltk.tokenize import wordpunct_tokenize


def sub_leaves(tree, node):
    # NLTK 3 renamed Tree.node to Tree.label()
    return [t.leaves() for t in tree.subtrees(lambda s: s.label() == node)]


class Gazetteer(ChunkParserI):
    """
    Find and annotate words that match the given patterns.
    Patterns are a list of tuples; every tuple holds a regular expression
    and the IOB tag to apply to whatever it matches.
    Part-of-speech tagging must be performed before the gazetteer words
    are applied, so you have to pass your tagger as a parameter.
    Example:
        >>> patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")]
        >>> gazetteer = Gazetteer(patterns, nltk.pos_tag, nltk.wordpunct_tokenize)
        >>> text = u"Η Νομική σχολή της Αθήνας"
        >>> t = gazetteer.parse(text)
        >>> print(t)
        (S Η/DT (ORG Νομική/NN σχολή/NN) της/DT (LOC Αθήνας/NN))
    """

    def __init__(self, patterns, pos_tagger, tokenizer):
        """
        Initialize the class.

        :param patterns:
            The patterns to search for in the text, as a list of tuples of
            a regular expression and the tag to apply
        :param pos_tagger:
            The tagger to use for applying part-of-speech tags to the text
        :param tokenizer:
            The tokenizer to use for tokenizing the text
        """
        self.patterns = patterns
        self.pos_tag = pos_tagger
        self.tokenize = tokenizer
        self.lookahead = 0  # how many extra words a gazetteer entry may span
        self.words = []  # the words found by applying the regular expressions
        self.iobtags = []  # for each set of words, keep the corresponding tag

    def iob_tags(self, tagged_sent):
        """
        Search the tagged sentence for gazetteer words and apply their IOB tags.

        :param tagged_sent:
            A tokenized text with part-of-speech tags
        :type tagged_sent: list
        :return:
            the (word, pos, iob) triples, with IOB tags such as B-LOCATION
        :rtype: list
        """
        i = 0
        l = len(tagged_sent)
        inside = False  # marks the I- tag
        iobs = []

        while i < l:
            word, ptag = tagged_sent[i]  # 'ptag', to avoid shadowing nltk's pos_tag
            j = i + 1  # the next word
            k = j + self.lookahead  # how many words in a row we may search
            nextwords, nexttags = [], []  # for now, just the ith word
            add_tag = False  # no tag yet, this is O

            while j <= k:
                words = ' '.join([word] + nextwords)  # expand our word list
                if words in self.words:  # search for the words
                    index = self.words.index(words)  # keep the index to use for IOB tags
                    if inside:
                        iobs.append((word, ptag, 'I-' + self.iobtags[index]))  # use the indexed tag
                    else:
                        iobs.append((word, ptag, 'B-' + self.iobtags[index]))

                    for nword, ntag in zip(nextwords, nexttags):  # there was more than one word
                        iobs.append((nword, ntag, 'I-' + self.iobtags[index]))  # apply I- to all of them

                    add_tag, inside = True, True
                    i = j  # skip the tagged words
                    break

                if j < l:  # we haven't reached the end of the tagged sentence
                    nextword, nexttag = tagged_sent[j]  # get the next word and its tag
                    nextwords.append(nextword)
                    nexttags.append(nexttag)
                    j += 1
                else:
                    break

            if not add_tag:  # unknown word
                inside = False
                i += 1
                iobs.append((word, ptag, 'O'))  # it's an Outside token

        return iobs

    def parse(self, text, conlltags=True):
        """
        Given a text, applies tokenization, part-of-speech tagging and the
        gazetteer words with their tags. Returns a CoNLL tree.

        :param text: The text to parse
        :type text: str
        :param conlltags: if True, return a tree; otherwise the IOB triples
        :type conlltags: bool
        :return: A CoNLL tree
        :rtype: Tree or list
        """
        # apply the regular expressions and find all the
        # gazetteer words in the text
        for pattern, tag in self.patterns:
            words_found = set(re.findall(pattern, text))  # keep the unique words
            if len(words_found) > 0:
                for word in words_found:  # words_found may be more than one
                    self.words.append(word)  # keep the words
                    self.iobtags.append(tag)  # and their tag

        # find the pattern with the maximum number of words;
        # this will be the lookahead variable
        for word in self.words:  # we don't care about tags now
            nwords = word.count(' ')
            if nwords > self.lookahead:
                self.lookahead = nwords

        # tokenize and apply part-of-speech tagging
        tagged_sent = self.pos_tag(self.tokenize(text))
        # find the IOB tags
        iobs = self.iob_tags(tagged_sent)

        if conlltags:
            return conlltags2tree(iobs)
        else:
            return iobs


if __name__ == "__main__":
    patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")]
    g = Gazetteer(patterns, pos_tag, wordpunct_tokenize)
    text = u"Η Νομική σχολή της Αθήνας"
    t = g.parse(text)
    print(t)

    dir_with_lists = "Lists"
    patterns = []
    tags = []
    for root, dirs, files in os.walk(dir_with_lists):
        for f in files:
            lines = codecs.open(os.path.join(root, f), 'r', 'utf-8').readlines()
            tag = os.path.splitext(f)[0]  # the file name (without extension) is the tag
            for l in lines[1:]:  # the first line of each file is skipped
                patterns.append((l.rstrip(), tag))
                tags.append(tag)

    text = codecs.open("sample.txt", 'r', "utf-8").read()
    g = Gazetteer(patterns, pos_tag, wordpunct_tokenize)  # rebuild with the new patterns
    t = g.parse(text.lower())
    print(t)

    for tag in set(tags):
        for gaz_word in sub_leaves(t, tag):
            print(gaz_word[0][0], tag)

In the `if __name__ == "__main__":` block you can see an example of where I create the patterns: `patterns = [(u"Αθήνα[ς]?", "LOC"), (u"Νομική[ς]? [Σσ]χολή[ς]?", "ORG")]`.

Later in the code, it reads files from a directory named Lists (put it in the folder where you keep the code above). The name of each file becomes the Gazetteer's tag. So, make files like LOC.txt with the patterns for locations (the LOC tag), PERSON.txt for persons, and so on.
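As for running several chunkers back to back: ChunkParserI has no backoff parameter the way the taggers do, but you can get a similar effect by converting each chunker's output tree back to IOB triples and letting each later chunker fill in only the tokens still tagged O. A minimal sketch, assuming your chunkers (the Cookbook's PersonChunker and LocationChunker are used as hypothetical examples) all take a POS-tagged sentence:

from nltk.chunk.util import tree2conlltags, conlltags2tree

def chain_chunkers(tagged_sent, chunkers):
    # apply the chunkers in order; earlier chunkers win on overlapping tokens
    iobs = [(word, pos, 'O') for word, pos in tagged_sent]
    for chunker in chunkers:
        tree = chunker.parse(tagged_sent)
        for i, (word, pos, iob) in enumerate(tree2conlltags(tree)):
            if iobs[i][2] == 'O' and iob != 'O':  # fill only still-untagged tokens
                iobs[i] = (word, pos, iob)
    return conlltags2tree(iobs)

# hypothetical usage with the Cookbook's chunkers:
# tree = chain_chunkers(pos_tag(wordpunct_tokenize(text)),
#                       [PersonChunker(), LocationChunker()])

The same trick should work with ne_chunk, since nltk.ne_chunk(tagged_sent) also returns a tree over the same tokens.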
