Source code for pyconll.unit.token

import operator


def _unit_empty_map(value, empty):
    """
    Map unit values for CoNLL-U columns to a string or None if empty.

    Args:
    value: The value to map.
    empty: The empty representation for this unit.

    Returns:
    None if value is empty and value otherwise.
    """
    return None if value == empty else value


def _dict_empty_map(values, empty, delim, av_separator, v_delimiter):
    """
    Parse a multi-valued dict column from its CoNLL-U string form.

    This delegates to _dict_empty_map_helper with the singleton flag off.

    Args:
    values: The raw column string to parse.
    empty: The marker that denotes an empty column.
    delim: The delimiter between components of the column.
    av_separator: The separator between the attribute and its values.
    v_delimiter: The delimiter between the values of a single attribute.

    Returns:
    An empty dict if values equals the empty marker. Otherwise, a dict that
    maps each attribute to the set of its values.
    """
    return _dict_empty_map_helper(
        values,
        empty,
        delim,
        av_separator,
        v_delimiter,
        singleton=False,
    )


def _dict_singleton_empty_map(values, empty, delim, av_separator):
    """
    Parse a single-valued dict column from its CoNLL-U string form.

    This delegates to _dict_empty_map_helper with the singleton flag on and no
    value delimiter, since each attribute carries exactly one value.

    Args:
    values: The raw column string to parse.
    empty: The marker that denotes an empty column.
    delim: The delimiter between components of the column.
    av_separator: The separator between the attribute and its value.

    Returns:
    An empty dict if values equals the empty marker. Otherwise, a dict that
    maps each attribute to its single value.
    """
    return _dict_empty_map_helper(
        values,
        empty,
        delim,
        av_separator,
        v_delimiter=None,
        singleton=True,
    )


def _dict_empty_map_helper(values, empty, delim, av_separator, v_delimiter,
                           singleton):
    """
    A helper to consolidate logic between singleton and non-singleton mapping.

    Args:
    values: The value to parse.
    empty: The empty representation for this value in CoNLL-U format.
    delim: The delimiter between components of the value.
    av_separator: The separator between attribute and value in each component.
    v_delimiter: The delimiter between values for the same attribute.
    singleton: A flag to indicate if the value has singleton values or not.

    Returns:
    An empty dict if the value is empty and otherwise a parsed equivalent.
    """
    if values == empty:
        return {}
    else:
        d = {}
        for el in values.split(delim):
            parts = el.split(av_separator)
            if len(parts) == 1:
                k = parts[0]
                v = None
            elif len(parts) == 2:
                k, v = parts

            if singleton or v is None:
                d[k] = v
            else:
                v = set(v.split(v_delimiter))
                d[k] = v

        return d


def _unit_conll_map(value, empty):
    """
    Map a unit value to its CoNLL-U format equivalent.

    Args:
    value: The value to convert to its CoNLL-U format.
    empty: The empty representation for a unit in CoNLL-U.

    Returns:
    empty if value is None and value otherwise.
    """
    return empty if value is None else value


def _dict_conll_map(values, empty, delim, av_separator, v_delimiter):
    """
    Serialize a dict with multi-valued attributes into CoNLL-U format.

    This delegates to _dict_conll_map_helper with the singleton flag off. The
    helper orders the components by attribute and orders the values within
    each attribute.

    Args:
    values: The dict to serialize.
    empty: The marker that denotes an empty column in CoNLL-U.
    delim: The delimiter placed between components in the output.
    av_separator: The separator placed between an attribute and its values.
    v_delimiter: The delimiter placed between the values of one attribute.

    Returns:
    The CoNLL-U formatted string for the dict.
    """
    return _dict_conll_map_helper(
        values,
        empty,
        delim,
        av_separator,
        v_delimiter,
        singleton=False,
    )


def _dict_singleton_conll_map(values, empty, delim, av_separator):
    """
    Serialize a dict with single-valued attributes into CoNLL-U format.

    This delegates to _dict_conll_map_helper with the singleton flag on and no
    value delimiter.

    Args:
    values: The dict to serialize.
    empty: The marker that denotes an empty column in CoNLL-U.
    delim: The delimiter placed between attribute-value pairs.
    av_separator: The separator placed between an attribute and its value.

    Returns:
    The CoNLL-U formatted string for the dict.
    """
    return _dict_conll_map_helper(
        values,
        empty,
        delim,
        av_separator,
        v_delimiter=None,
        singleton=True,
    )


def _dict_conll_map_helper(values, empty, delim, av_separator, v_delimiter,
                           singleton):
    """
    Helper to map dicts to CoNLL-U format equivalents.

    Args:
    values: The value, dict, to map.
    empty: The empty CoNLL-U reprsentation for this value.
    delim: The delimiter between attribute-value pairs.
    av_separator: The separator between attribute and value.
    v_delimiter: The delimiter between values of the same attribute if
        necessary.
    singleton: Flag to indicate if the dictionary values are singletons or
        collections.

    Returns:
    The CoNLL-U formatted equivalent to the value.
    """
    if values == {}:
        return empty
    else:
        # TODO: what if one of values is None
        sorted_av_pairs = sorted(values.items(), key=operator.itemgetter(0))

        if singleton:
            av_pairs = sorted_av_pairs
        else:
            av_pairs = []
            for pair in sorted_av_pairs:
                sorted_attr_values = sorted(pair[1], key=str.lower)
                str_attrs = v_delimiter.join(sorted_attr_values)

                av_pairs.append([pair[0], str_attrs])

        return delim.join([av_separator.join(pair) for pair in av_pairs])


[docs]class Token:
    # TODO: Allow custom, token parsing for misc field if necessary somehow.
    """
    A token in a CoNLL-U file. This consists of 10 columns, each separated by
    a single tab character and ending in an LF ('\\n') line break. Each of the 10
    column values corresponds to a specific component of the token, such as id,
    word form, lemma, etc.

    This class does not do any formatting validation on input or output. This
    means that invalid input may be properly processed and then output. Or that
    client changes to the token may result in invalid data that can then be
    output. Properly formatted CoNLL-U will always work on input and as long as
    all basic units are strings output will work as expected. The result may
    just not be proper CoNLL-U.

    Also note that the word form for a token is immutable. This is because
    CoNLL-U is inherently interested in annotation schemes and not storing
    sentences.
    """

    # The different delimiters and separators for the CoNLL-U format.
    # FIELD_DELIMITER separates columns on the token line.
    # COMPONENT_DELIMITER separates a field with multiple components.
    # AV_SEPARATOR separates the attribute from the value in a component.
    # AV_DEPS_SEPARATOR separates the attribute from the value in a component
    # of the deps field.
    # V_DELIMITER separates the values in an attribute-value pair.
    # EMPTY is the representation of an empty field.
    FIELD_DELIMITER = '\t'
    COMPONENT_DELIMITER = '|'
    AV_SEPARATOR = '='
    AV_DEPS_SEPARATOR = ':'
    V_DELIMITER = ','
    EMPTY = '_'

[docs]    def __init__(self, source, empty=True, _line_number=None):
        """
        Construct the token from the given source.

        A Token line must end in an an LF line break according to the
        specification. However, this method will accept a line with or without
        this ending line break.

        Further, a '_' that appears in the form and lemma is ambiguous and can
        either refer to an empty value or an actual underscore. So the flag
        empty_form allows for control over this if it is known from outside
        information. If, the token is a multiword token, all fields except for
        form should be empty.

        Note that no validation is done on input. Valid input will be processed
        properly, but there is no guarantee as to invalid input that does not
        follow the CoNLL-U specifications.

        Args:
        line: The line that represents the Token in CoNLL-U format.
        empty: A flag to signify if the word form and lemma can be assumed to be
            empty and not the token signifying empty. Only if both the form and
            lemma are both the same token as empty and there is no empty
            assumption, will they not be assigned to None.
        _line_number: The line number for this Token in a CoNLL-U file. For
            internal use mostly.

        Raises:
        ValueError if the provided source is not composed of 10 tab separated
        columns.
        """
        if source[-1] == '\n':
            source = source[:-1]
        self._source = source

        self.line_number = _line_number

        fields = source.split(Token.FIELD_DELIMITER)
        self._fields = fields

        if len(self._fields) != 10:
            raise ValueError('The number of columns per token line is 10')

        # Assign all the field values from the line to internal equivalents.
        self.id = fields[0]

        # If we can assume the form and lemma are empty, or if either of the
        # fields are not the empty token, then we can proceed as usual.
        # Otherwise, these empty tokens might not mean empty, but rather the
        # actual tokens.
        if empty or (fields[1] != Token.EMPTY or fields[2] != Token.EMPTY):
            self._form = _unit_empty_map(fields[1], Token.EMPTY)
            self.lemma = _unit_empty_map(fields[2], Token.EMPTY)
        elif fields[1] == Token.EMPTY and fields[2] == Token.EMPTY:
            self._form = fields[1]
            self.lemma = fields[2]

        self.upos = _unit_empty_map(fields[3], Token.EMPTY)
        self.xpos = _unit_empty_map(fields[4], Token.EMPTY)
        self.feats = _dict_empty_map(fields[5], Token.EMPTY,
                                     Token.COMPONENT_DELIMITER,
                                     Token.AV_SEPARATOR, Token.V_DELIMITER)
        self.head = _unit_empty_map(fields[6], Token.EMPTY)
        self.deprel = _unit_empty_map(fields[7], Token.EMPTY)
        self.deps = _dict_singleton_empty_map(fields[8], Token.EMPTY,
                                              Token.COMPONENT_DELIMITER,
                                              Token.AV_DEPS_SEPARATOR)
        # TODO: Handle misc field better. I'm not sure if it has to be key-value
        # structure.
        self.misc = _dict_empty_map(fields[9], Token.EMPTY,
                                    Token.COMPONENT_DELIMITER,
                                    Token.AV_SEPARATOR, Token.V_DELIMITER)

    @property
    def form(self):
        """
        Provide the word form of this Token. This property makes it readonly.

        Returns:
        The Token wordform.
        """
        return self._form

[docs]    def is_multiword(self):
        """
        Checks if this token is a multiword token.

        Returns:
        True if this token is a multiword token, and False otherwise.
        """
        return '-' in self.id

[docs]    def conll(self):
        """
        Convert Token to the CoNLL-U representation.

        Note that this does not include a newline at the end.

        Returns:
        A string representing the token as a line in a CoNLL-U file.
        """
        # Transform the internal CoNLL-U representations back to text and
        # combine the fields.
        id = self.id
        form = _unit_conll_map(self.form, Token.EMPTY)
        lemma = _unit_conll_map(self.lemma, Token.EMPTY)
        upos = _unit_conll_map(self.upos, Token.EMPTY)
        xpos = _unit_conll_map(self.xpos, Token.EMPTY)
        feats = _dict_conll_map(self.feats, Token.EMPTY,
                                Token.COMPONENT_DELIMITER, Token.AV_SEPARATOR,
                                Token.V_DELIMITER)
        head = _unit_conll_map(self.head, Token.EMPTY)
        deprel = _unit_conll_map(self.deprel, Token.EMPTY)
        deps = _dict_singleton_conll_map(self.deps, Token.EMPTY,
                                         Token.COMPONENT_DELIMITER,
                                         Token.AV_DEPS_SEPARATOR)
        misc = _dict_conll_map(self.misc, Token.EMPTY,
                               Token.COMPONENT_DELIMITER, Token.AV_SEPARATOR,
                               Token.V_DELIMITER)

        items = [id, form, lemma, upos, xpos, feats, head, deprel, deps, misc]

        return Token.FIELD_DELIMITER.join(items)

[docs]    def __eq__(self, other):
        """
        Test if this Token is equal to other.

        Args:
        other: The other token to compare against.

        Returns:
        True if the this Token and the other are the same. Two tokens are
        considered the same when all columns are the same.
        """
        return self._source == other._source