Source code for graf.graphs

# graf-python: Python GrAF API
#
# Copyright (C) 2014 American National Corpus
# Author: Keith Suderman <suderman@cs.vassar.edu> (Original API)
#         Stephen Matysik <smatysik@gmail.com> (Conversion to Python)
# URL: <http://www.anc.org/>
# For license information, see LICENSE.TXT
#

"""
An annotation graph is a directed graph that represents an annotation of
arbitrary and application dependent size. A graph may cover a sentence,
paragraph, document, or entire corpus. However, to keep processing feasible
graphs are typically relatively small (sentences say) and then combined into
larger graphs as needed.
"""

import sys

from graf.annotations import FeatureStructure, AnnotationList, AnnotationSpace


class IdDict(dict):
    __slots__ = ('_id_field',)

    def __init__(self, data=(), field='id'):
        dict.__init__(self, data)
        self._id_field = field

    def add(self, obj):
        self[getattr(obj, self._id_field)] = obj

    def __iter__(self):
        if hasattr(self, 'itervalues'):
            return self.itervalues()
        elif hasattr(self, 'values'):
            return iter(self.values())

    def __contains__(self, obj):
        return dict.__contains__(self, getattr(obj, self._id_field, obj))


class GraphEdges(IdDict):
    __slots__ = ()

    def add(self, obj):
        IdDict.add(self, obj)
        obj.from_node.out_edges.add(obj)
        obj.to_node.in_edges.add(obj)


class GraphNodes(IdDict):

    def __init__(self):
        IdDict.__init__(self)

    def add(self, obj):
        """Adds the given node or creates one with the given id"""
        if not isinstance(obj, Node):
            obj = Node(obj)

        IdDict.add(self, obj)
        return obj

    def get_or_create(self, id):
        if id in self:
            return self[id]
        else:
            return self.add(id)


class GraphASpaces(IdDict):
    __slots__ = ('_add_hook',)

    def __init__(self, add_hook):
        IdDict.__init__(self, field='as_id')
        self._add_hook = add_hook

    def add(self, obj):
        IdDict.add(self, obj)
        self._add_hook(obj)

    def create(self, as_id):
        res = AnnotationSpace(as_id)
        self.add(res)
        return res


[docs]class Graph(object): """ Class of Graph. """
[docs] def __init__(self): """ Constructor for Graph. """ self.features = FeatureStructure() self.nodes = GraphNodes() self._top_edge_id = 0 self._edge_pos = 0 self.edges = GraphEdges() self.regions = IdDict() self.content = None self.header = GraphHeader() self.annotation_spaces = GraphASpaces(self.header.add_annotation_space) # List that will contain additional/extra information # to the graph source/origins self.additional_information = {}
[docs] def create_edge(self, from_node, to_node, id=None): """Create graf.Edge from id, from_node, to_node and add it to this graf.Graph. Parameters ---------- from_node : graf.Node The start node for the edge. to_node: graf.Node The end node for the edge. id : str, optional An ID for the edge. We will create one if none is given. Returns ------- res : graf.Edge The Edge object that was created. """ if not hasattr(from_node, 'id'): from_node = self.nodes[from_node] if from_node.id not in self.nodes: self.nodes.add(from_node) if not hasattr(to_node, 'id'): to_node = self.nodes[to_node] if to_node.id not in self.nodes: self.nodes.add(to_node) if id is None: while id is None or id in self.edges: id = 'e%d' % self._top_edge_id self._top_edge_id += 1 res = Edge(id, from_node, to_node, self._edge_pos) self._edge_pos += 1 #if not res in self.edges.values(): self.edges.add(res) return res
[docs] def find_edge(self, from_node, to_node): """Search for C{Edge} with its from_node, to_node, either nodes or ids. :param from_node: C{Node} or C{str} :param to_node: C{Node} or C{str} :return: C{Edge} or None """ # resolve ids to nodes if necessary if not isinstance(from_node, Node): from_node = self.nodes[from_node] if not isinstance(to_node, Node): to_node = self.nodes[to_node] if len(from_node.out_edges) < len(to_node.in_edges): for edge in from_node.out_edges: if edge.to_node == to_node: return edge else: for edge in to_node.in_edges: if edge.from_node == from_node: return edge return None
def get_element(self, id): if id in self.nodes: return self.nodes[id] return self.edges[id] def get_region(self, *anchors): anchors = list(anchors) for region in self.regions: if region.anchors == anchors: return region return None @property def root(self): try: if sys.version_info[:2] >= (3, 0): return self.iter_roots().__next__() else: return self.iter_roots().next() except StopIteration: return None @root.setter def root(self, node): # FIXME: how should this interact with node.is_root self.header.clear_roots() if node.id not in self.nodes: raise ValueError('The new root node is not in the graph: %r' % node) self.header.roots.append(node.id) def iter_roots(self): return (self.nodes[id] for id in self.header.roots)
class GraphElement(object): """ Class of edges in Graph: - Each edge maintains the source (from) C{Node} and the destination. (to) C{Node}. - Edges may also contain one or more C{Annotation} objects. """ def __init__(self, id=""): """Constructor for C{GraphElement}. :param id: C{str} """ self.id = id self.visited = False self.annotations = AnnotationList(self, 'element') def __repr__(self): return "GraphElement id = " + self.id @property def is_annotated(self): return bool(self.annotations) def clear(self): self.visited = False def __eq__(self, other): """Comparison of two graph elements by ID. :param o: C{GraphElement} """ if other is None: return False return type(self) is type(other) and self.id == other.id def visit(self): self.visited = True class EdgeList(object): """An append-only structure with O(1) lookup by id or order-index""" __slots__ = ('_by_ind', '_by_id') def __init__(self): self._by_ind = [] self._by_id = {} def add(self, edge): self._by_id[edge.id] = edge self._by_ind.append(edge) def __iter__(self): return iter(self._by_ind) def __len__(self): return len(self._by_ind) def __getitem__(self, sl): """ Returns the edge corresponding to the specified slice/index or raises an IndexError. If the given value is not a slice or int, returns the edge with the given id, or raises a KeyError """ # should ID lookup have preference?? if isinstance(sl, (int, slice)): return self._by_ind[sl] return self._by_id[sl] def __contains__(self, edge): if hasattr(edge, 'id'): edge = edge.id return edge in self._by_id def ids(self): return self._by_id.keys()
[docs]class Node(GraphElement): """ Class for nodes within a C{Graph} instance. Each node keeps a list of in-edges and out-edges. Each collection is backed by two data structures: 1. A list (for traversals) 2. A hash map Nodes may also contain one or more C{Annotation} objects. """
[docs] def __init__(self, id=""): GraphElement.__init__(self, id) self.in_edges = EdgeList() self.out_edges = EdgeList() self.links = []
def __repr__(self): return "NodeID = " + self.id def __lt__(self, other): return self.id < other.id # Relationship to media def add_link(self, link): self.links.append(link) self._add_regions(link) def _add_regions(self, regions): for region in regions: region.nodes.append(self)
[docs] def add_region(self, region): """Adds the given region to the first link for this node""" if self.links: self.links[0].append(region) self._add_regions((region,)) else: self.add_link(Link((region,)))
# Relationship within graph def iter_parents(self): for edge in self.in_edges: res = edge.from_node if res is not None: yield res @property def parent(self): try: if sys.version_info[:2] >= (3, 0): return self.iter_parents().__next__() else: return self.iter_parents().next() except StopIteration: raise AttributeError('%r has no parents' % self) def iter_children(self): for edge in self.out_edges: res = edge.to_node if res is not None: yield res
[docs] def clear(self): """Clears this node's visited status and those of all visited descendents""" self.visited = False for child in self.iter_children(): if child.visited: child.clear()
@property def degree(self): return len(self.in_edges) + len(self.out_edges)
[docs]class Edge(GraphElement): """ Class of edges in Graph: - Each edge maintains the source (from) graf.Node and the destination (to) graf.Node. - Edges may also contain one or more graf.Annotation objects. """
[docs] def __init__(self, id, from_node, to_node, pos=None): """Edge Constructor. Parameters ---------- id : str The ID for the new edge. from_node : graf.Node The source node for the edge. to_node : graf.Node The target node for the edge. pos : int, optional An optional position of the edge in the graph. This will only be used when we render the graf, to make it easier to store an order of the edges. """ GraphElement.__init__(self, id) self.from_node = from_node self.to_node = to_node self.pos = pos
def __repr__(self): return "Edge id = " + self.id
class GraphHeader(object): """ Class that represents the graphHeader of each GrAF file. """ def __init__(self): self.annotation_spaces = {} self.depends_on = [] self.roots = [] def __repr__(self): return "GraphHeader" def add_annotation_space(self, aspace): self.annotation_spaces[aspace.as_id] = aspace def add_dependency(self, type): self.depends_on.append(type) def clear_roots(self): del self.roots[:]
[docs]class StandoffHeader(object): """ Class that represents the primary data document header. The construction of the file is based on the ISO 24612. """
[docs] def __init__(self, version = "1.0.0", **kwargs): """Class's constructor. Parameters ---------- version : str Version of the document header file. filedesc : ElementTree Element with the description of the file. profiledesc : ElementTree Element with the description of the source file. datadesc : ElementTree Element with the description of the annotations. """ self._kwargs = kwargs self.version = version self.filedesc = self._get_key_value('fileDesc') self.profiledesc = self._get_key_value('profilDesc') self.datadesc = self._get_key_value('dataDesc')
def __repr__(self): return "StandoffHeader" def _get_key_value(self, key): if key == 'fileDesc': return FileDesc() if key == 'profilDesc': return ProfileDesc() if key == 'dataDesc': return DataDesc(None) return None
[docs]class FileDesc(object): """ Class that represents the descriptions of the file containing the primary data document. """
[docs] def __init__(self, **kwargs): """Class's constructor. Parameters ---------- titlestmt : str Name of the file containing the primary data document. extent : dict Size of the resource. The keys are 'count' - Value expressing the size. And 'unit' - Unit in which the size of the resource is expressed. Both keys are mandatory. title : str Title of the primary data document. author : dict Author of the primary data document. The keys are 'age' and 'sex'. source : dict Source from which the primary data was obtained. The keys are 'type' - Role or type the source with regard to the document. And 'source'. Both keys are mandatory. distributor : str Distributor of the primary data (if different from source). publisher : str Publisher of the source. pubAddress : str Address of publisher. eAddress : dict Email address, URL, etc. of publisher. The keys are 'email' and 'type' - Type of electronic address, such as email or URL. Both keys are mandatory. pubDate : str Date of original publication. Should use the ISO 8601 format YYYY-MM-DD. idno : dict Identification number for the document. The keys are 'number' and 'type' - Type of the identification number (e.g. ISBN). Both keys are mandatory. pubName : str Name of the publication in which the primary data was originally published (e.g. journal in which it appeared). documentation : str PID where documentation concerning the data may be found. """ self._kwargs = kwargs self.titlestmt = self._get_key_value('titlestmt') self.extent = self._get_key_value('extent') self.title = self._get_key_value('title') self.author = self._get_key_value('author') self.source = self._get_key_value('source') self.distributor = self._get_key_value('distributor') self.publisher = self._get_key_value('publisher') self.pubAddress = self._get_key_value('pubAddress') self.eAddress = self._get_key_value('eAddress') self.pubDate = self._get_key_value('pubDate') self.idno = self._get_key_value('idno') self.pubName = self._get_key_value('pubName') self.documentation = self._get_key_value('documentation')
def __repr__(self): return "FileDesc" def _get_key_value(self, key): if key in self._kwargs: return self._kwargs[key] return None
[docs]class ProfileDesc(object): """ Class that represents the descriptions of the file containing the primary data document. """
[docs] def __init__(self, **kwargs): """Class's constructor. Parameters ---------- catRef : str One or more categories defined in the resource header. subject : str Topic of the primary data. domain : str Primary domain of the data. subdomain : str Subdomain of the data. languages : array_like Array that contains the codes of the language(s) of the primary data. The codes should be in the ISO 639. participants : array_like Array that contains the participants in an interaction. Each person is a dict element and the keys are 'age', 'sex', 'role' and 'id' - Identifier for reference from annotation documents. The 'id' key is mandatory. settings : array_like Array that contains the settings within which a language interaction takes place. Each settings is a dictionary and the keys are 'who', 'time', 'activity' and 'locale'. """ self._kwargs = kwargs self.languages = self._get_key_value('languages') self.catRef = self._get_key_value('catRef') self.subject = self._get_key_value('subject') self.domain = self._get_key_value('domain') self.subdomain = self._get_key_value('subdomain') self.participants = self._get_key_value('participants') self.settings = self._get_key_value('settings')
def __repr__(self): return "ProfileDesc"
[docs] def add_language(self, language_code): """This method is responsible to add the annotations to the list of languages. The language list in this class will represents the language(s) that the primary data use. Parameters ---------- language_code : str ISO 639 code(s) for the language(s) of the primary data. """ self.languages.append(language_code)
[docs] def add_participant(self, id, age=None, sex=None, role=None): """This method is responsible to add the annotations to the list of participants. The parcipant list in this class will represents participants in an interaction with the data manipulated in the files pointed by the header. A participant is a person in this case and it's important and required to give the id. Parameters ---------- id : str Identifier for reference from annotation documents. age : int Age of the speaker. role : str Role of the speaker in the discourse. sex : str One of male, female, unknown. """ participant = {'id': id} if age: participant['age'] = age if sex: participant['sex'] = sex if role: participant['role'] = role self.participants.append(participant)
[docs] def add_setting(self, who, time, activity, locale): """This method is responsible to add the annotations to the list of settings. The setting list in this class will represents the setting or settings within which a language interaction takes place, either as a prose description or a series of setting elements. A setting is a particular setting in which a language interaction takes place. Parameters ---------- who : str Reference to person IDs involved in this interaction. time : str Time of the interaction. activity : str What a participant in a language interaction is doing other than speaking. locale : str Place of the interaction, e.g. a room, a restaurant, a park bench. """ self.settings.append({'who': who, 'time': time, 'activity': activity, 'locale': locale})
def _get_key_value(self, key): if key in self._kwargs: return self._kwargs[key] return None
[docs]class DataDesc(object): """ Class that represents the annotations to the document associated with the primary data document this header describes. """
[docs] def __init__(self, primaryData): """Class's constructor. Parameters ---------- primaryData : dict Provides the location of the primary data document. The keys are 'loc' - relative path or PID of the primary data document, 'loctype' - Indicates whether the primary data path is a fully specified path (PID) or a path relative to the location of this header file, the default is 'relative', the other option is 'URL'. The other key is 'f.id' - File type via reference to definition in the resource header. All keys are mandatory. """ self.primaryData = primaryData self.annotations_list = None
def __repr__(self): return "DataDesc"
[docs] def add_annotation(self, loc, fid, loctype="relative"): """This method is responsible to add the annotations to the list of annotations. The annotations list in this class will represents the documents associated with the primary data document that this header will describe. Parameters ---------- loc : str Relative path or PID of the annotation document. fid : str File type via reference to definition in the resource header. loctype : str Indicates whether the path is a fully specified path or a path relative to the header file. """ if self.annotations_list is None: self.annotations_list = [] value = {'loc': loc, 'loctype': loctype, 'f.id': fid} if value not in self.annotations_list: self.annotations_list.append({'loc': loc, 'loctype': loctype, 'f.id': fid})
class RevisonDesc(): """ Class that represents the changes made in a specific of the primary data document header. """ def __init__(self, changes=None): """Class's constructor. Parameters ---------- changes : array_like Array that contains a list of changes. Each change is a dictionary. The keys are 'changedate', 'respname' and 'item': All keys are mandatory. """ self.changes = changes def __repr__(self): return "RevisonDesc" def add_change(self, changedate, respname, item): """This method is responsible to add the annotations to the list of changes. The changes list in this class will represents the information about a particular change made to the document. Parameters ---------- changedate : str Date of the change in ISO 8601 format. responsible : str Identification of the person responsible for the change. item : str Description of the change. """ self.changes.append({'changedate': changedate, 'respname': respname, 'item': item})