Source code for corenlp_xml.document

"""
Sub-module for handling the document level of Stanford CoreNLP's XML output
"""
from lxml import etree
from collections import OrderedDict
from nltk import Tree
from corenlp_xml.dependencies import DependencyGraph
from corenlp_xml.coreference import Coreference


class Document:
    """
    This class abstracts a Stanford CoreNLP Document
    """

    def __init__(self, xml_string):
        """
        Constructor method.

        :param xml_string: The XML string we're going to parse and represent, coming from CoreNLP
        :type xml_string: str
        """
        self._sentences_dict = None
        self._sentiment = None
        self._xml_string = xml_string
        self._xml = etree.fromstring(xml_string)
        self._coreferences = None

    @property
    def sentiment(self):
        """
        Returns average sentiment of document. Must have sentiment enabled in XML output.

        :getter: returns average sentiment of the document
        :type: float
        """
        if self._sentiment is None:
            results = self._xml.xpath('/root/document/sentences')
            self._sentiment = float(results[0].get("averageSentiment", 0)) if len(results) > 0 else None
        return self._sentiment

    def _get_sentences_dict(self):
        """
        Returns sentence objects

        :return: ordered dict of sentences
        :rtype: collections.OrderedDict
        """
        if self._sentences_dict is None:
            sentences = [Sentence(element) for element in self._xml.xpath('/root/document/sentences/sentence')]
            self._sentences_dict = OrderedDict([(s.id, s) for s in sentences])
        return self._sentences_dict

    @property
    def sentences(self):
        """
        Returns the ordered dict of sentences as a list.

        :getter: returns list of sentences, in order
        :type: list of corenlp_xml.document.Sentence
        """
        return list(self._get_sentences_dict().values())

    def get_sentence_by_id(self, id):
        """
        Gets sentence by ID

        :param id: the ID of the sentence, as defined in the XML
        :type id: int

        :return: a sentence
        :rtype: corenlp_xml.document.Sentence
        """
        return self._get_sentences_dict().get(id)

    @property
    def coreferences(self):
        """
        Returns a list of Coreference classes

        :getter: Returns a list of coreferences
        :type: list of corenlp_xml.coreference.Coreference
        """
        if self._coreferences is None:
            coreferences = self._xml.xpath('/root/document/coreference/coreference')
            if len(coreferences) > 0:
                self._coreferences = [Coreference(self, element) for element in coreferences]
        return self._coreferences
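
# Minimal usage sketch (an assumption for illustration, not part of the library):
# the XML string is expected to come from CoreNLP run with "-outputFormat xml",
# read here from a hypothetical output file.
#
#     with open("example.txt.xml") as f:
#         doc = Document(f.read())
#     for sentence in doc.sentences:
#         print(sentence.id, str(sentence.tokens))
#     if doc.coreferences is not None:
#         print(len(doc.coreferences), "coreference chains")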


class Sentence:
    """
    This abstracts a sentence
    """

    def __init__(self, element):
        """
        Constructor method

        :param element: An etree element.
        :type element: lxml.etree.ElementBase
        """
        self._id = None
        self._sentiment = None
        self._tokens_dict = None
        self._parse = None
        self._parse_string = None
        self._basic_dependencies = None
        self._collapsed_dependencies = None
        self._collapsed_ccprocessed_dependencies = None
        self._element = element

    @property
    def id(self):
        """
        :return: the ID attribute of the sentence
        :rtype: int
        """
        if self._id is None:
            self._id = int(self._element.get('id'))
        return self._id

    @property
    def sentiment(self):
        """
        The sentiment of this sentence

        :getter: Returns the sentiment value of this sentence
        :type: int
        """
        if self._sentiment is None:
            self._sentiment = int(self._element.get('sentiment'))
        return self._sentiment

    def _get_tokens_dict(self):
        """
        Accesses tokens dict

        :return: The ordered dict of the tokens
        :rtype: collections.OrderedDict
        """
        if self._tokens_dict is None:
            tokens = [Token(element) for element in self._element.xpath('tokens/token')]
            self._tokens_dict = OrderedDict([(t.id, t) for t in tokens])
        return self._tokens_dict

    @property
    def tokens(self):
        """
        The tokens related to this sentence

        :getter: Returns a list of Token instances
        :type: corenlp_xml.document.TokenList
        """
        return TokenList(self._get_tokens_dict().values())

    def get_token_by_id(self, id):
        """
        Accesses token by the XML ID

        :param id: The XML ID of the token
        :type id: int

        :return: The token
        :rtype: corenlp_xml.document.Token
        """
        return self._get_tokens_dict().get(id)

    def subtrees_for_phrase(self, phrase_type):
        """
        Returns subtrees corresponding to all phrases matching a given phrase type

        :param phrase_type: a constituent label such as "NP", "VP", "det", etc.
        :type phrase_type: str

        :return: a list of nltk.tree.Tree instances
        :rtype: list of nltk.tree.Tree
        """
        return [subtree for subtree in self.parse.subtrees()
                if subtree.label().lower() == phrase_type.lower()]

    def phrase_strings(self, phrase_type):
        """
        Returns strings corresponding to all phrases matching a given phrase type

        :param phrase_type: a constituent label such as "NP", "VP", "det", etc.
        :type phrase_type: str

        :return: a list of strings representing those phrases
        """
        return [u" ".join(subtree.leaves()) for subtree in self.subtrees_for_phrase(phrase_type)]
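
    # Sketch of constituency-based phrase extraction (assumes the "parse"
    # annotator was enabled when the XML was produced; the sentence ID and
    # example output below are illustrative, not library output):
    #
    #     sentence = doc.get_sentence_by_id(1)
    #     noun_phrases = sentence.phrase_strings("NP")    # e.g. ["the quick brown fox"]
    #     for tree in sentence.subtrees_for_phrase("NP"):
    #         tree.pretty_print()                         # nltk.Tree rendering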

    @property
    def semantic_head(self):
        """
        Returns the semantic head of the sentence -- AKA the dependent of the root node of the dependency parse

        :return: the mention related to the semantic head
        :rtype: corenlp_xml.coreference.Mention
        """
        return self.basic_dependencies.links_by_type(u"root")[0].dependent

    @property
    def parse_string(self):
        """
        Accesses the S-Expression parse string stored on the XML document

        :getter: Returns the parse string
        :type: str
        """
        if self._parse_string is None:
            parse_text = self._element.xpath('parse/text()')
            if len(parse_text) > 0:
                self._parse_string = parse_text[0]
        return self._parse_string

    @property
    def parse(self):
        """
        Accesses the parse tree based on the S-expression parse string in the XML

        :getter: Returns the NLTK parse tree
        :type: nltk.Tree
        """
        if self.parse_string is not None and self._parse is None:
            self._parse = Tree.fromstring(self._parse_string)
        return self._parse

    @property
    def basic_dependencies(self):
        """
        Accesses basic dependencies from the XML output

        :getter: Returns the dependency graph for basic dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        if self._basic_dependencies is None:
            deps = self._element.xpath('dependencies[@type="basic-dependencies"]')
            if len(deps) > 0:
                self._basic_dependencies = DependencyGraph(deps[0])
        return self._basic_dependencies

    @property
    def collapsed_dependencies(self):
        """
        Accesses collapsed dependencies for this sentence

        :getter: Returns the dependency graph for collapsed dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        if self._collapsed_dependencies is None:
            deps = self._element.xpath('dependencies[@type="collapsed-dependencies"]')
            if len(deps) > 0:
                self._collapsed_dependencies = DependencyGraph(deps[0])
        return self._collapsed_dependencies

    @property
    def collapsed_ccprocessed_dependencies(self):
        """
        Accesses collapsed, CC-processed dependencies

        :getter: Returns the dependency graph for collapsed and CC-processed dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        if self._collapsed_ccprocessed_dependencies is None:
            deps = self._element.xpath('dependencies[@type="collapsed-ccprocessed-dependencies"]')
            if len(deps) > 0:
                self._collapsed_ccprocessed_dependencies = DependencyGraph(deps[0])
        return self._collapsed_ccprocessed_dependencies
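
# Minimal sketch of working with a Sentence's dependency graphs (assumes a
# dependency-producing annotator was enabled; links_by_type and .dependent are
# the same calls used by semantic_head above):
#
#     sentence = doc.get_sentence_by_id(1)
#     graph = sentence.collapsed_ccprocessed_dependencies
#     if graph is not None:
#         root_link = graph.links_by_type(u"root")[0]
#         print(root_link.dependent)
#     print(sentence.semantic_head)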


class TokenList(list):
    """
    A list of Token instances that renders as a whitespace-joined string
    """

    def __init__(self, *args):
        super(TokenList, self).__init__(args[0])

    def __add__(self, other):
        return TokenList(list.__add__(self, other))

    def __mul__(self, other):
        return TokenList(list.__mul__(self, other))

    def __getslice__(self, i, j):
        return TokenList(list.__getslice__(self, i, j))

    def __getitem__(self, item):
        result = list.__getitem__(self, item)
        try:
            return TokenList(result)
        except TypeError:
            return result

    def __str__(self):
        return " ".join([token.word for token in self])
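
# TokenList behaves like a plain list but stringifies to the sentence text,
# e.g. (illustrative values, not library output):
#
#     tokens = sentence.tokens            # a TokenList
#     print(str(tokens))                  # "The quick brown fox ..."
#     print(str(tokens[0:2]))             # slicing returns another TokenList
#     print(tokens[0].word, tokens[0].pos)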


class Token:
    """
    Wraps the token XML element
    """

    def __init__(self, element):
        """
        Constructor method

        :param element: An etree element
        :type element: lxml.etree.ElementBase
        """
        self._id = None
        self._word = None
        self._lemma = None
        self._character_offset_begin = None
        self._character_offset_end = None
        self._pos = None
        self._ner = None
        self._speaker = None
        self._element = element

    @property
    def id(self):
        """
        Lazy-loads ID

        :getter: Returns the ID of the token element
        :type: int
        """
        if self._id is None:
            self._id = int(self._element.get('id'))
        return self._id

    @property
    def word(self):
        """
        Lazy-loads word value

        :getter: Returns the plain string value of the word
        :type: str
        """
        if self._word is None:
            words = self._element.xpath('word/text()')
            if len(words) > 0:
                self._word = words[0]
        return self._word

    @property
    def lemma(self):
        """
        Lazy-loads the lemma for this word

        :getter: Returns the plain string value of the word lemma
        :type: str
        """
        if self._lemma is None:
            lemmata = self._element.xpath('lemma/text()')
            if len(lemmata) > 0:
                self._lemma = lemmata[0]
        return self._lemma

    @property
    def character_offset_begin(self):
        """
        Lazy-loads character offset begin node

        :getter: Returns the integer value of the beginning offset
        :type: int
        """
        if self._character_offset_begin is None:
            offsets = self._element.xpath('CharacterOffsetBegin/text()')
            if len(offsets) > 0:
                self._character_offset_begin = int(offsets[0])
        return self._character_offset_begin

    @property
    def character_offset_end(self):
        """
        Lazy-loads character offset end node

        :getter: Returns the integer value of the ending offset
        :type: int
        """
        if self._character_offset_end is None:
            offsets = self._element.xpath('CharacterOffsetEnd/text()')
            if len(offsets) > 0:
                self._character_offset_end = int(offsets[0])
        return self._character_offset_end

    @property
    def pos(self):
        """
        Lazy-loads the part of speech tag for this word

        :getter: Returns the plain string value of the POS tag for the word
        :type: str
        """
        if self._pos is None:
            poses = self._element.xpath('POS/text()')
            if len(poses) > 0:
                self._pos = poses[0]
        return self._pos

    @property
    def ner(self):
        """
        Lazy-loads the NER for this word

        :getter: Returns the plain string value of the NER tag for the word
        :type: str
        """
        if self._ner is None:
            ners = self._element.xpath('NER/text()')
            if len(ners) > 0:
                self._ner = ners[0]
        return self._ner

    @property
    def speaker(self):
        """
        Lazy-loads the speaker for this word

        :getter: Returns the plain string value of the speaker tag for the word
        :type: str
        """
        if self._speaker is None:
            speakers = self._element.xpath('Speaker/text()')
            if len(speakers) > 0:
                self._speaker = speakers[0]
        return self._speaker
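
# Token exposes per-word annotations lazily from the XML, e.g. (illustrative,
# assuming the corresponding annotators were enabled when the XML was produced):
#
#     token = sentence.get_token_by_id(1)
#     print(token.word, token.lemma, token.pos, token.ner)
#     print(token.character_offset_begin, token.character_offset_end)
#     print(token.speaker)   # None if the Speaker element is absent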