Source code for pyconll.unit.token

"""
Defines the Token type and parsing and output logic. A Token is the based unit
in CoNLL-U and so the data and parsing in this module is central to the CoNLL-U
format.
"""

import functools
import math
from typing import Callable, ClassVar, Dict, Optional, Set, Tuple

from pyconll.conllable import Conllable
from pyconll.exception import FormatError, ParseError


def _unit_empty_map(value, empty):
    """
    Map unit values for CoNLL-U columns to a string or None if empty.

    Args:
        value: The value to map.
        empty: The empty representation for this unit.

    Returns:
        None if value is empty and value otherwise.
    """
    return None if value == empty else value


def _dict_empty_map_parser(v, v_delimiter):
    """
    Map a value into the appropriate form, for a standard dict based column.

    Args:
        v: The raw string value that was parsed from the column as a value.
        v_delimiter: The delimiter between components of the value.

    Returns:
        The parsed value, as a set of its components.

    Raises:
        ParseError: If there was an error parsing the value. This happens when
            the value is None.
    """
    if v is not None:
        vs = set(v.split(v_delimiter))
        return vs

    error_msg = f'Error parsing "{v}" properly. Please check against CoNLL format spec.'
    raise ParseError(error_msg)


def _dict_empty_map(values, empty, delim, av_separator, v_delimiter):
    """
    Map dict values for CoNLL-U columns to a dict or empty dict if empty.

    Args:
        values: The value to check for existence.
        empty: The empty representation for this dict.
        delim: The delimiter between components in the provided value.
        av_separator: The attribute-value separator for each component.
        v_delimiter: The delimiter between values for the same attribute.

    Returns:
        An empty dict if value is empty. Otherwise, a dict of key-value where
        the values are sets.

    Raises:
        ParseError: If the dict format was unable to parsed, because of a lack
            of a value.
    """
    return _dict_empty_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, _dict_empty_map_parser)


def _create_dict_tupled_empty_parse(size, strict):
    """
    Parameterized creation of a parser for tupled values.

    Args:
        size: The expected size of the tuple.
        strict: Flag to signifiy if parsed values with less components than size
            will be accepted. In this case, missing values will be supplated
            with None.

    Returns:
        The parameterized parser function for parsing tupled columns.

    Raises:
        ParseError: If the parsing is strict and there is a component size
            mismatch, or if there are too many components in general.
    """
    def _dict_tupled_empty_parser(v, v_delimiter):
        """
        Map a value into the appropriate form, for a tupled based column.

        Args:
            v: The raw string value parsed from a column.
            v_delimiter: The delimiter between components of the value.

        Returns:
            The parsed value as a tuple.

        Raises:
            ParseError: If there was an error parsing the value as a tuple.
        """
        if v is None:
            raise ParseError(
                f'Error parsing "{v}" as tuple properly. Please check against CoNLL'
                ' format spec')

        components = v.split(v_delimiter)
        left = size - len(components)

        if not strict and 0 <= left < size:
            vs = tuple(components + [None] * left)
        else:
            raise ParseError(
                f'Error parsing "{v}" as tuple properly. Please check against CoNLL'
                ' format spec.')

        return vs

    return _dict_tupled_empty_parser


TUPLE_PARSER_MEMOIZE: Dict[int, Callable[[str, str], Tuple[Optional[str],
                                                           ...]]] = {}


def _dict_tupled_empty_map(values, empty, delim, av_separator, v_delimiter,
                           size):
    """
    Map dict based values for CoNLL-U columns to a dict with tupled values.

    Tupled values are those with a maximum number of components, which is also
    greater than 1.

    Args:
        values: The value to parse.
        empty: The empty representation for this value in CoNLL format.
        delim: The delimiter between components in the value.
        av_separator: The separator between the attribute and value in each
            component.
        v_delimiter: The delimiter between values for the same attribute.
        size: The maximum size of the tuple. Components with less values than
            this will be supplanted with None.

    Returns:
        An empty dict if values was empty. Otherwise, a dictionary with fixed
        length tuples as the values.

    Raises:
        ParseError: If there was an error parsing the tuple. Related to the
            number of components.
    """
    try:
        parser = TUPLE_PARSER_MEMOIZE[size]
    except KeyError:
        parser = _create_dict_tupled_empty_parse(size, False)
        TUPLE_PARSER_MEMOIZE[size] = parser

    return _dict_empty_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, parser)


def _dict_mixed_empty_parser(v, v_delimiter):
    """
    Parse a value into the appropriate form, for a mixed value based column.

    Args:
        v: The raw string value parsed from a column.
        v_delimiter: The delimiter between components of the value.

    Returns:
        The parsed value, which can either be None in the case of no
        corresponding value, or a set of the components in the value.
    """
    if v is None:
        return v

    vs = set(v.split(v_delimiter))
    return vs


def _dict_mixed_empty_map(values, empty, delim, av_separator, v_delimiter):
    """
    Map dict based values for CoNLL-U columns to dict with mixed values.

    Mixed values are those that can be either singletons or sets.

    Args:
        values: The value to parse.
        empty: The empty representation for this value in CoNLL-U format.
        delim: The delimiter between components in the value.
        av_separator: The separator between attribute and value in each
            component.
        v_delimiter: The delimiter between values for the same attribute.

    Returns:
        An empty dict if values is empty. Otherwise, a dictionary with either
        None, or a set of strings as the values.

    Raises:
        ParseError: If the dict format was unable to parsed.
    """
    return _dict_empty_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, _dict_mixed_empty_parser)


def _dict_empty_map_helper(values, empty, delim, av_separator, v_delimiter,
                           parser):
    """
    A helper to consolidate logic between singleton and non-singleton mapping.

    Args:
        values: The value to parse.
        empty: The empty representation for this value in CoNLL-U format.
        delim: The delimiter between components of the value.
        av_separator: The separator between attribute and value in each
            component.
        v_delimiter: The delimiter between values for the same attribute.
        parser: The parser of the value from the attribute value pair.

    Returns:
        An empty dict if the value is empty and otherwise a parsed equivalent.

    Raises:
        ParseError: If the dict format was unable to parsed. This error will be
            raised by the provided parser.
    """
    if values == empty:
        return {}

    d = {}
    for el in values.split(delim):
        parts = el.split(av_separator, 1)
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ''):
            k = parts[0]
            v = None
        elif len(parts) == 2:
            k, v = parts

        parsed = parser(v, v_delimiter)
        d[k] = parsed

    return d


def _unit_conll_map(value, empty):
    """
    Map a unit value to its CoNLL-U format equivalent.

    Args:
        value: The value to convert to its CoNLL-U format.
        empty: The empty representation for a unit in CoNLL-U.

    Returns:
        empty if value is None and value otherwise.
    """
    return empty if value is None else value


def _dict_conll_map_formatter(v, v_delimiter):
    """
    Formatter to convert a set of CoNLL values to a string representation.

    Args:
        v: The set of values.
        v_delimiter: The delimiter between the values in a string representation.

    Returns:
        The appropriate string representation of the value in CoNLL format.

    Raises:
        FormatError: When there are no values to output.
    """
    if v:
        sorted_vs = sorted(v, key=str.lower)
        str_vs = v_delimiter.join(sorted_vs)
    else:
        error_msg = 'There are no values to format.'
        raise FormatError(error_msg)

    return str_vs


def _dict_conll_map(values, empty, delim, av_separator, v_delimiter, av_key):
    """
    Map a dict whose attributes can have multiple values to CoNLL-U format.

    This CoNLL-U format will be sorted alphabetically by attribute and
    attributes with more than one value will have values sorted
    alphabetically.

    Args:
        values: The dict to convert to its CoNLL-U format.
        empty: The empty representation for a dict in CoNLL-U.
        delim: The delimiter between components in the output.
        av_separator: The attribute-value separator in the provided string.
        v_delimiter: The delimiter between values in attribute-value pairs.
        av_key: The key for attribute-value pairs ordering, on output.

    Returns:
        The CoNLL-U format as a string.
    """
    return _dict_conll_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, _dict_conll_map_formatter,
                                  av_key)


def _dict_tupled_conll_formatter(v, v_delimiter):
    """
    Convert a tuple of values into a CoNLL string representation.

    Args:
        v: The tuple of values to convert.
        v_delimiter: The delimiter between values in the ConLL representation.

    Returns:
        The string representation of the tuple in CoNLL format.

    Raises:
        FormatError: When all values in the tuple value are None.
    """
    presents = list(filter(lambda el: el is not None, v))
    if not presents:
        error_msg = 'All values in the tuple are None.'
        raise FormatError(error_msg)

    form = v_delimiter.join(presents)
    return form


def _dict_tupled_conll_map(values, empty, delim, av_separator, v_delimiter,
                           av_key):
    """
    Map a dict whose components are max length tuples to a CoNLL format.

    Args:
        values: The dict to convert.
        empty: The empty CoNLL representation for this value.
        delim: The delimiter between attribute-value pairs.
        av_separator: The separator between the attribute and value.
        v_delimiter: The delimiter between values of the same attribute.
        av_key: The ordering for the attribute-value pairs, on output.

    Returns:
        The CoNLL formatted equivalent to the provided value as a tupled column.

    Raises:
        FormatError: If there was an error converting a tuple to a CoNLL format.
    """
    return _dict_conll_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, _dict_tupled_conll_formatter,
                                  av_key)


def _dict_mixed_conll_formatter(v, v_delimiter):
    """
    Format a mixed value into a string representation.

    Args:
        v: The value to convert to a mixed CoNLL string format.
        v_delimiter: The delimiter between values in the string.

    Returns:
        A CoNLL representation of the mixed value.
    """
    if v is None:
        return v

    sorted_vs = sorted(v, key=str.lower)
    str_vs = v_delimiter.join(sorted_vs)

    return str_vs


def _dict_mixed_conll_map(values, empty, delim, av_separator, v_delimiter,
                          av_key):
    """
    Map a dict whose components can be mixed to a CoNLL-U format.

    Args:
        values: The dict to convert to CoNLL-U format.
        empty: The empty CoNLL-U representation for this value.
        delim: The delimiter between attribute-value pairs.
        av_separator: The separator between attribute and value.
        v_delimiter: The delimiter between values of the same attribute if
            necessary.
        av_key: The ordering for the attribute-value pairs, on output.

    Returns:
        The CoNLL-U formatted equivalent to the value.
    """
    return _dict_conll_map_helper(values, empty, delim, av_separator,
                                  v_delimiter, _dict_mixed_conll_formatter,
                                  av_key)


def _dict_conll_map_helper(values, empty, delim, av_separator, v_delimiter,
                           formatter, av_key):
    """
    Helper to map dicts to CoNLL-U format equivalents.

    Args:
        values: The value, dict, to map.
        empty: The empty CoNLL-U representation for this value.
        delim: The delimiter between attribute-value pairs.
        av_separator: The separator between attribute and value.
        v_delimiter: The delimiter between values of the same attribute if
            necessary.
        formatter: The function to convert an attribute value pair into a CoNLL
            column representation. This function should take in a value
            representation, and a value delimiter, and output a string
            representation. It should also output None to mean there is no
            string representation.
        av_key: The sorting key for the attribute-value pairs on output.

    Returns:
        The CoNLL-U formatted equivalent to the value.
    """
    def paramed(pair):
        f = formatter(pair[1], v_delimiter)
        if f is None:
            return (pair[0], )

        return (pair[0], f)

    if values == {}:
        return empty

    sorted_av_pairs = sorted(values.items(), key=av_key)
    formatted = map(paramed, sorted_av_pairs)
    output = delim.join([av_separator.join(form) for form in formatted])

    return output if output else empty


@functools.total_ordering
class _TokenIdComparer:
    """
    Implementation class for comparing token ids to be sorted, using the
    standard python sorting routines. Two ids are equal if the id text is
    exactly the same. Otherwise, ids are compared by parts, with a range id
    being compared by the start index and then by the end index, and decimal
    ids having the radix separated parts compared separately.
    """
    def __init__(self, token_id):
        """
        Create the comparer wrapping the given, assumed valid format, id.

        Args:
            token_id: The token id to wrap.
        """
        self.token_id = token_id

    def __eq__(self, other):
        """
        Compares if the current id is equal to the provided id.

        Args:
            other: The other id wrapper to compare against.

        Returns:
            True if the two underlying ids are exactly the same.
        """
        return self.token_id == other.token_id

    def __ne__(self, other):
        """
        Compares if the current id is not equal to the provided id.

        Args:
            other: The other id wrapper to compare against.

        Returns:
            True if the two underlying ids are not the same.
        """
        return not self == other

    @staticmethod
    def _split_token_id_as_range(token_id):
        """
        Splits a token into its individual parts as a range.

        If the token does not represent a range, the beginning and end are the
        same token id.

        Args:
            token_id: The id to split into its range parts.

        Returns:
            A tuple of size two with the token id representing the range.
        """
        idx = token_id.find('-')
        if idx < 0:
            ranges = (token_id, token_id)
        else:
            ranges = (token_id[:idx], token_id[idx + 1:])

        return ranges

    @staticmethod
    def _split_by_radix(token_id):
        """
        Split a non-range token id by the radix point.

        Any id without a radix point will assume it is 0.

        Args:
            token_id: The id to decompose into its two parts based on the radix.

        Returns:
            A tuple of size 2 with the id parts decomposed as integers.
        """
        idx = token_id.find('.')
        if idx < 0:
            return [int(token_id), 0]

        first = int(token_id[:idx])
        second = int(token_id[idx + 1:])
        return [first, second]

    @staticmethod
    def _cmp_individual_ids(a, b):
        """
        Compare two non-range token ids using a traditional compare function.

        Args:
            a: The first argument to compare and the basis of the comparison.
            b: The second argument to compare against.

        Returns:
            The results of a traditional compare function in basis of a to b.
        """
        a_l, a_r = _TokenIdComparer._split_by_radix(a)
        b_l, b_r = _TokenIdComparer._split_by_radix(b)

        return _TokenIdComparer._zcopysign(2, a_l - b_l) + \
               _TokenIdComparer._zcopysign(1, a_r - b_r)

    @staticmethod
    def _zcopysign(a, b):
        """
        copysign that returns the sign indicator if it is 0 or -0.

        Args:
            a: The magnitude.
            b: The sign indicator.

        Returns:
            Usual copysign result unless sign indicator is 0.
        """
        if not b:
            return b

        return math.copysign(a, b)

    def __lt__(self, other):
        """
        Compares if other is less than the currently wrapped id.

        Args:
            other: The wrapped id to compare against.

        Returns:
            True if the current id is less than the id wrapped by other.
        """
        self_split_1, self_split_2 = _TokenIdComparer._split_token_id_as_range(
            self.token_id)
        other_split_1, other_split_2 = _TokenIdComparer._split_token_id_as_range(
            other.token_id)

        first_cmp = _TokenIdComparer._cmp_individual_ids(
            self_split_1, other_split_1)

        return first_cmp < 0 or (first_cmp == 0
                                 and _TokenIdComparer._cmp_individual_ids(
                                     self_split_2, other_split_2))


[docs]class Token(Conllable):
    """
    A token in a CoNLL-U file. This consists of 10 columns, each separated by
    a single tab character and ending in an LF ('\\n') line break. Each of the 10
    column values corresponds to a specific component of the token, such as id,
    word form, lemma, etc.

    This class does not do any formatting validation on input or output. This
    means that invalid input may be properly processed and then output. Or that
    client changes to the token may result in invalid data that can then be
    output. Properly formatted CoNLL-U will always work on input and as long as
    all basic units are strings output will work as expected. The result may
    just not be proper CoNLL-U.

    Also note that the word form for a token is immutable. This is because
    CoNLL-U is inherently interested in annotation schemes and not storing
    sentences.
    """

    __slots__ = [
        'id', '_form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel',
        'deps', 'misc'
    ]

    # The different delimiters and separators for the CoNLL-U format.
    # FIELD_DELIMITER separates columns on the token line.
    # COMPONENT_DELIMITER separates a field with multiple components.
    # AV_SEPARATOR separates the attribute from the value in a component.
    # V_DELIMITER separates the values in an attribute-value pair.
    FIELD_DELIMITER: ClassVar[str] = '\t'
    COMPONENT_DELIMITER: ClassVar[str] = '|'
    AV_SEPARATOR: ClassVar[str] = '='
    AV_DEPS_SEPARATOR: ClassVar[str] = ':'
    V_DELIMITER: ClassVar[str] = ','
    V_DEPS_DELIMITER: ClassVar[str] = ':'
    EMPTY: ClassVar[str] = '_'

    # Keys for sorting attribute-value columns. BY_ID converts the attribute
    # value pair to the integer value of the attribute, and BY_CASE_SENSITIVE
    # converts the pair to the lowercase version of the attribute.
    BY_ID: ClassVar[Callable[[Tuple[
        str, str]], _TokenIdComparer]] = lambda pair: _TokenIdComparer(pair[0])
    BY_CASE_INSENSITIVE: ClassVar[Callable[[Tuple[
        str, str]], str]] = lambda pair: pair[0].lower()

[docs]    def __init__(self, source: str, empty: bool = False) -> None:
        """
        Construct a Token from the given source line.

        A Token line ends in an an LF line break according to the CoNLL-U
        specification. However, this method accepts a line with or without the
        LF line break.

        On parsing, a '_' in the form and lemma is ambiguous and either refers
        to an empty value or to an actual underscore. The empty parameter flag
        controls how this situation should be handled.

        This method also guarantees properly processing valid input, but invalid
        input may not be parsed properly. Some inputs that do not follow the
        CoNLL-U specification may still be parsed properly and as expected. So
        proper parsing is not an indication of validity.

        Args:
            line: The line that represents the Token in CoNLL-U format.
            empty: A flag to control if the word form and lemma can be assumed
                to be empty and not the token signifying empty. If both the form
                and lemma are underscores and empty is set to False (there is no
                empty assumption), then the form and lemma will be underscores
                rather than None.

        Raises:
            ParseError: On various parsing errors, such as not enough columns or
                improper column values.
        """
        if source[-1] == '\n':
            source = source[:-1]

        fields = source.split(Token.FIELD_DELIMITER)

        if len(fields) != 10:
            error_msg = f'The number of columns per token line must be 10. Invalid token: {source}'
            raise ParseError(error_msg)

        # Assign all the field values from the line to internal equivalents.
        self.id: str = fields[0]

        # If we can assume the form and lemma are empty, or if either of the
        # fields are not the empty token, then we can proceed as usual.
        # Otherwise, these empty tokens might not mean empty, but rather the
        # actual tokens.
        if empty or (fields[1] != Token.EMPTY or fields[2] != Token.EMPTY):
            self._form: Optional[str] = _unit_empty_map(fields[1], Token.EMPTY)
            self.lemma: Optional[str] = _unit_empty_map(fields[2], Token.EMPTY)
        else:
            self._form = fields[1]
            self.lemma = fields[2]

        self.upos: Optional[str] = _unit_empty_map(fields[3], Token.EMPTY)
        self.xpos: Optional[str] = _unit_empty_map(fields[4], Token.EMPTY)
        self.feats: Dict[str,
                         Set[str]] = _dict_empty_map(fields[5], Token.EMPTY,
                                                     Token.COMPONENT_DELIMITER,
                                                     Token.AV_SEPARATOR,
                                                     Token.V_DELIMITER)
        self.head: Optional[str] = _unit_empty_map(fields[6], Token.EMPTY)
        self.deprel: Optional[str] = _unit_empty_map(fields[7], Token.EMPTY)
        self.deps: Dict[str,
                        Tuple[str, str, str, str]] = _dict_tupled_empty_map(
                            fields[8], Token.EMPTY, Token.COMPONENT_DELIMITER,
                            Token.AV_DEPS_SEPARATOR, Token.V_DEPS_DELIMITER, 4)
        self.misc: Dict[str, Optional[Set[str]]] = _dict_mixed_empty_map(
            fields[9], Token.EMPTY, Token.COMPONENT_DELIMITER,
            Token.AV_SEPARATOR, Token.V_DELIMITER)

    @property
    def form(self) -> Optional[str]:
        """
        Provide the word form of this Token. This property is read only.

        Returns:
            The Token form.
        """
        return self._form

[docs]    def is_multiword(self) -> bool:
        """
        Checks if this Token is a multiword token.

        Returns:
            True if this token is a multiword token, and False otherwise.
        """
        return '-' in self.id

[docs]    def is_empty_node(self) -> bool:
        """
        Checks if this Token is an empty node, used for ellipsis annotation.

        Note that this is separate from any field being empty, rather it means
        the id has a period in it.

        Returns:
            True if this token is an empty node and False otherwise.
        """
        return '.' in self.id

[docs]    def conll(self) -> str:
        """
        Convert this Token to its CoNLL-U representation.

        A Token's CoNLL-U representation is a line. Note that this method does
        not include a newline at the end.

        Returns:
            A string representing the Token in CoNLL-U format.

        Raises:
            FormatError: If the Token can not be converted to the CoNLL format.
        """
        # Transform the internal CoNLL-U representations back to text and
        # combine the fields.
        token_id = self.id
        form = _unit_conll_map(self.form, Token.EMPTY)
        lemma = _unit_conll_map(self.lemma, Token.EMPTY)
        upos = _unit_conll_map(self.upos, Token.EMPTY)
        xpos = _unit_conll_map(self.xpos, Token.EMPTY)
        feats = _dict_conll_map(self.feats, Token.EMPTY,
                                Token.COMPONENT_DELIMITER, Token.AV_SEPARATOR,
                                Token.V_DELIMITER, Token.BY_CASE_INSENSITIVE)
        head = _unit_conll_map(self.head, Token.EMPTY)
        deprel = _unit_conll_map(self.deprel, Token.EMPTY)
        deps = _dict_tupled_conll_map(self.deps, Token.EMPTY,
                                      Token.COMPONENT_DELIMITER,
                                      Token.AV_DEPS_SEPARATOR,
                                      Token.V_DEPS_DELIMITER, Token.BY_ID)
        misc = _dict_mixed_conll_map(self.misc, Token.EMPTY,
                                     Token.COMPONENT_DELIMITER,
                                     Token.AV_SEPARATOR, Token.V_DELIMITER,
                                     Token.BY_CASE_INSENSITIVE)

        items = [
            token_id, form, lemma, upos, xpos, feats, head, deprel, deps, misc
        ]

        return Token.FIELD_DELIMITER.join(items)