"""
Sub-module for handling document-level stuff
"""
from lxml import etree
from collections import OrderedDict
from nltk import Tree
from dependencies import DependencyGraph
from coreference import Coreference
class Document:
    """
    This class abstracts a Stanford CoreNLP Document
    """

    def __init__(self, xml_string):
        """
        Constructor method.

        :param xml_string: The XML string we're going to parse and represent, coming from CoreNLP
        :type xml_string: str
        """
        self._xml_string = xml_string
        self._xml = etree.fromstring(xml_string)
        # Lazily-populated caches, filled on first property access.
        self._sentences_dict = None
        self._sentiment = None
        self._coreferences = None

    @property
    def sentiment(self):
        """
        Returns average sentiment of document. Must have sentiment enabled in XML output.

        :getter: returns average sentiment of the document
        :type: float
        """
        if self._sentiment is None:
            nodes = self._xml.xpath('/root/document/sentences')
            if nodes:
                # Missing attribute falls back to 0; absent <sentences> leaves None.
                self._sentiment = float(nodes[0].get("averageSentiment", 0))
        return self._sentiment

    def _get_sentences_dict(self):
        """
        Returns sentence objects, building the cache on first use.

        :return: ordered dict of sentences keyed by sentence ID
        :rtype: collections.OrderedDict
        """
        if self._sentences_dict is None:
            elements = self._xml.xpath('/root/document/sentences/sentence')
            self._sentences_dict = OrderedDict(
                (sentence.id, sentence) for sentence in map(Sentence, elements)
            )
        return self._sentences_dict

    @property
    def sentences(self):
        """
        Returns the ordered dict of sentences as a list.

        :getter: returns list of sentences, in order
        :type: list of corenlp_xml.document.Sentence
        """
        return self._get_sentences_dict().values()

    def get_sentence_by_id(self, id):
        """
        Gets sentence by ID

        :param id: the ID of the sentence, as defined in the XML
        :type id: int
        :return: a sentence, or None when the ID is unknown
        :rtype: corenlp_xml.document.Sentence
        """
        return self._get_sentences_dict().get(id)

    @property
    def coreferences(self):
        """
        Returns a list of Coreference classes

        :getter: Returns a list of coreferences
        :type: list of corenlp_xml.coreference.Coreference
        """
        if self._coreferences is None:
            nodes = self._xml.xpath('/root/document/coreference/coreference')
            if nodes:
                self._coreferences = [Coreference(self, node) for node in nodes]
        return self._coreferences
class Sentence:
    """
    This abstracts a sentence
    """

    def __init__(self, element):
        """
        Constructor method

        :param element: An etree element wrapping a <sentence> node
        :type element: lxml.etree.ElementBase
        """
        # Lazily-populated caches; each property fills its own slot on first access.
        self._id = None
        self._sentiment = None
        self._tokens_dict = None
        self._parse = None
        self._parse_string = None
        self._basic_dependencies = None
        self._collapsed_dependencies = None
        self._collapsed_ccprocessed_dependencies = None
        self._element = element

    @property
    def id(self):
        """
        :return: the ID attribute of the sentence
        :rtype: int
        """
        if self._id is None:
            self._id = int(self._element.get('id'))
        return self._id

    @property
    def sentiment(self):
        """
        The sentiment of this sentence. Requires sentiment enabled in the XML output.

        :getter: Returns the sentiment value of this sentence
        :type: int
        """
        if self._sentiment is None:
            self._sentiment = int(self._element.get('sentiment'))
        return self._sentiment

    def _get_tokens_dict(self):
        """
        Accesses tokens dict, building it on first use.

        :return: The ordered dict of the tokens, keyed by token ID
        :rtype: collections.OrderedDict
        """
        if self._tokens_dict is None:
            tokens = [Token(element) for element in self._element.xpath('tokens/token')]
            self._tokens_dict = OrderedDict([(t.id, t) for t in tokens])
        return self._tokens_dict

    @property
    def tokens(self):
        """
        The tokens related to this sentence

        :getter: Returns a list of Token instances
        :type: corenlp_xml.document.TokenList
        """
        return TokenList(self._get_tokens_dict().values())

    def get_token_by_id(self, id):
        """
        Accesses token by the XML ID

        :param id: The XML ID of the token
        :type id: int
        :return: The token, or None when the ID is unknown
        :rtype: corenlp_xml.document.Token
        """
        return self._get_tokens_dict().get(id)

    def subtrees_for_phrase(self, phrase_type):
        """
        Returns subtrees corresponding all phrases matching a given phrase type

        :param phrase_type: POS such as "NP", "VP", "det", etc.
        :type phrase_type: str
        :return: a list of NLTK.Tree.Subtree instances
        :rtype: list of NLTK.Tree.Subtree
        """
        # NOTE(review): `subtree.node` is the pre-NLTK-3 API (renamed `label()` in
        # NLTK >= 3); kept as-is to match the NLTK version this module targets.
        return [subtree for subtree in self.parse.subtrees()
                if subtree.node.lower() == phrase_type.lower()]

    def phrase_strings(self, phrase_type):
        """
        Returns strings corresponding all phrases matching a given phrase type

        :param phrase_type: POS such as "NP", "VP", "det", etc.
        :type phrase_type: str
        :return: a list of strings representing those phrases
        :rtype: list of unicode
        """
        return [u" ".join(subtree.leaves()) for subtree in self.subtrees_for_phrase(phrase_type)]

    @property
    def semantic_head(self):
        """
        Returns the semantic head of the sentence -- AKA the dependent of the root node
        of the dependency parse

        :return: the node related to the semantic head
        :rtype: corenlp_xml.dependencies.DependencyNode
        """
        return self.basic_dependencies.links_by_type(u"root")[0].dependent

    @property
    def parse_string(self):
        """
        Accesses the S-Expression parse string stored on the XML document

        :getter: Returns the parse string
        :type: str
        """
        if self._parse_string is None:
            parse_text = self._element.xpath('parse/text()')
            if len(parse_text) > 0:
                self._parse_string = parse_text[0]
        return self._parse_string

    @property
    def parse(self):
        """
        Accesses the parse tree based on the S-expression parse string in the XML

        :getter: Returns the NLTK parse tree
        :type: nltk.Tree
        """
        # NOTE(review): `Tree.parse` is the pre-NLTK-3 API (renamed `Tree.fromstring`
        # in NLTK >= 3); kept to match the NLTK version this module targets.
        if self.parse_string is not None and self._parse is None:
            self._parse = Tree.parse(self._parse_string)
        return self._parse

    def _dependency_graph(self, dep_type):
        """
        Builds a DependencyGraph from the first <dependencies> child matching
        the given type attribute.

        :param dep_type: value of the dependencies element's "type" attribute
        :type dep_type: str
        :return: the dependency graph, or None when no such element exists
        :rtype: corenlp_xml.dependencies.DependencyGraph or None
        """
        deps = self._element.xpath('dependencies[@type="%s"]' % dep_type)
        return DependencyGraph(deps[0]) if len(deps) > 0 else None

    @property
    def basic_dependencies(self):
        """
        Accesses basic dependencies from the XML output

        :getter: Returns the dependency graph for basic dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        if self._basic_dependencies is None:
            self._basic_dependencies = self._dependency_graph("basic-dependencies")
        return self._basic_dependencies

    @property
    def collapsed_dependencies(self):
        """
        Accesses collapsed dependencies for this sentence

        :getter: Returns the dependency graph for collapsed dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        # BUG FIX: previously read/wrote self._basic_dependencies, so accessing
        # basic deps first made this property return the basic graph instead.
        if self._collapsed_dependencies is None:
            self._collapsed_dependencies = self._dependency_graph("collapsed-dependencies")
        return self._collapsed_dependencies

    @property
    def collapsed_ccprocessed_dependencies(self):
        """
        Accesses collapsed, CC-processed dependencies

        :getter: Returns the dependency graph for collapsed and cc processed dependencies
        :type: corenlp_xml.dependencies.DependencyGraph
        """
        # BUG FIX: previously read/wrote self._basic_dependencies (copy-paste error).
        if self._collapsed_ccprocessed_dependencies is None:
            self._collapsed_ccprocessed_dependencies = \
                self._dependency_graph("collapsed-ccprocessed-dependencies")
        return self._collapsed_ccprocessed_dependencies
class TokenList(list):
    """
    A list of Token instances that stringifies to the space-joined token words.

    Concatenation, repetition, and slicing are overridden so their results stay
    TokenList instances instead of degrading to plain lists.
    """

    def __init__(self, *args):
        """
        :param args: a single iterable of tokens as the first positional argument
        """
        super(TokenList, self).__init__(args[0])

    # BUG FIX: __add__ was defined twice; the first definition was dead code.
    def __add__(self, other):
        return TokenList(list.__add__(self, other))

    def __mul__(self, other):
        return TokenList(list.__mul__(self, other))

    def __getslice__(self, i, j):
        # Python 2 only; Python 3 routes slicing through __getitem__ below.
        return TokenList(list.__getslice__(self, i, j))

    def __str__(self):
        return " ".join([token.word for token in self])

    def __getitem__(self, item):
        result = list.__getitem__(self, item)
        try:
            # A slice yields a list (iterable -> wrapped); a single token
            # is not iterable and raises TypeError, so it is returned as-is.
            return TokenList(result)
        except TypeError:
            return result
class Token:
    """
    Wraps the token XML element
    """

    def __init__(self, element):
        """
        Constructor method

        :param element: An etree element wrapping a <token> node
        :type element: lxml.etree.ElementBase
        """
        # Lazily-populated caches; each property fills its own slot on first access.
        self._id = None
        self._word = None
        self._lemma = None
        self._character_offset_begin = None
        self._character_offset_end = None
        self._pos = None
        self._ner = None
        self._speaker = None
        self._element = element

    def _first_text(self, path):
        """
        Returns the first text node matching an XPath expression, or None.

        Shared by all lazily-loaded text properties below.

        :param path: XPath expression relative to the token element
        :type path: str
        :rtype: str or None
        """
        values = self._element.xpath(path)
        return values[0] if len(values) > 0 else None

    @property
    def id(self):
        """
        Lazy-loads ID

        :getter: Returns the ID of the token element
        :type: int
        """
        if self._id is None:
            self._id = int(self._element.get('id'))
        return self._id

    @property
    def word(self):
        """
        Lazy-loads word value

        :getter: Returns the plain string value of the word
        :type: str
        """
        if self._word is None:
            self._word = self._first_text('word/text()')
        return self._word

    @property
    def lemma(self):
        """
        Lazy-loads the lemma for this word

        :getter: Returns the plain string value of the word lemma
        :type: str
        """
        if self._lemma is None:
            self._lemma = self._first_text('lemma/text()')
        return self._lemma

    @property
    def character_offset_begin(self):
        """
        Lazy-loads character offset begin node

        :getter: Returns the integer value of the beginning offset
        :type: int
        """
        if self._character_offset_begin is None:
            offset = self._first_text('CharacterOffsetBegin/text()')
            if offset is not None:
                self._character_offset_begin = int(offset)
        return self._character_offset_begin

    @property
    def character_offset_end(self):
        """
        Lazy-loads character offset end node

        :getter: Returns the integer value of the ending offset
        :type: int
        """
        if self._character_offset_end is None:
            offset = self._first_text('CharacterOffsetEnd/text()')
            if offset is not None:
                self._character_offset_end = int(offset)
        return self._character_offset_end

    @property
    def pos(self):
        """
        Lazy-loads the part of speech tag for this word

        :getter: Returns the plain string value of the POS tag for the word
        :type: str
        """
        if self._pos is None:
            self._pos = self._first_text('POS/text()')
        return self._pos

    @property
    def ner(self):
        """
        Lazy-loads the NER for this word

        :getter: Returns the plain string value of the NER tag for the word
        :type: str
        """
        if self._ner is None:
            self._ner = self._first_text('NER/text()')
        return self._ner

    @property
    def speaker(self):
        """
        Lazy-loads the speaker for this word

        :getter: Returns the plain string value of the speaker tag for the word
        :type: str
        """
        if self._speaker is None:
            self._speaker = self._first_text('Speaker/text()')
        return self._speaker