Source code for pyconll.format

"""
Format module providing consolidated interfaces for CoNLL data parsing and serialization.

This module defines three classes:
- ReadFormat: For read-only parsing operations
- WriteFormat: For write-only serialization operations
- Format: Combines both reading and writing capabilities

For typical use cases where both read and write operations are needed, use Format.
For specialized read-only or write-only scenarios, use ReadFormat or WriteFormat directly.
"""

import io
import os
import string
from typing import IO, Iterable, Iterator, Optional

from pyconll import _compile
from pyconll.exception import ParseError
from pyconll.schema import AbstractSentence, FieldDescriptor

PathLike = str | bytes | os.PathLike


def _pair_down_whitespace(
    line: str, start_idx: int, end_idx: Optional[int] = None
) -> Optional[str]:
    """
    Remove whitespace from the delimited beginning and end regions of the string.

    Args:
        line: The string to remove whitespace from.
        start_idx: The location from which to start removing whitespace.
        end_idx: The location to move back from for removing whitespace. If not provided, it same as
            the length of the string.

    Returns:
        The string without whitespace surrounding it or None if the string was entirely whitespace.
    """
    end_idx = len(line) if end_idx is None else end_idx

    while start_idx < end_idx and line[start_idx] in string.whitespace:
        start_idx += 1

    if start_idx == end_idx:
        return None

    while (end_idx - 1) > start_idx and line[end_idx - 1] in string.whitespace:
        end_idx -= 1

    return line[start_idx:end_idx]


[docs] class ReadFormat[T, S: AbstractSentence]: """ A read-only interface for parsing CoNLL formatted data. This class wraps Parser functionality and provides methods to parse CoNLL data from various sources including strings, files, and IO resources. Use this when only parsing operations are needed. """
[docs] def __init__( self, token_schema: type[T], sentence_schema: type[S], comment_marker: str = "#", delimiter: str = "\t", collapse_delimiters: bool = False, field_descriptors: Optional[dict[str, Optional[FieldDescriptor]]] = None, extra_primitives: Optional[set[type]] = None, ) -> None: """ Initialize the read format handler. Args: token_schema: The Token type to use for parsing. sentence_schema: The Sentence type to use for parsing. comment_marker: The character that marks the beginning of comments. Defaults to '#'. delimiter: The delimiter between the columns on a token line. Defaults to tab. collapse_delimiters: Flag if sequential delimiters denote an empty value or should be collapsed into one larger delimiter. Defaults to False. field_descriptors: The descriptors for the fields on the schema as a mapping from the field name to the descriptor instance. For primitive types, use None as the descriptor. This takes precedence over anything on the type itself. extra_primitives: The set of types to consider as primitives (default construction and the str() operator are appropriate). This takes precedence over what is given on the tokenspec decorator. """ if len(comment_marker) != 1: raise ValueError("The comment marker is expected to only be one character.") self.comment_marker = comment_marker self.sentence_schema = sentence_schema self.token_parser = _compile.token_parser( token_schema, delimiter, collapse_delimiters, field_descriptors, extra_primitives )
[docs] def parse_token(self, buffer: str) -> T: """ Parse a buffer into a Token. Args: buffer: The string to parse into a Token. No newline splitting is done on the input. Returns: The buffer parsed into the underlying Token type. """ return self.token_parser(buffer)
[docs] def parse_sentence(self, buffer: str) -> S: """ Parse a single sentence from the buffer. If there is more than one sentence in the buffer an error is thrown. Args: buffer: The string to parse for a single sentence. Returns: The single sentence that was parsed out of the string. """ it = self.iter_from_string(buffer) sentence = next(it) stopped = False try: next(it) except StopIteration: stopped = True if not stopped: raise RuntimeError( "Expected only a single sentence from the buffer, but more than one was found." ) return sentence
[docs] def load_from_string(self, source: str) -> list[S]: """ Parse a CoNLL formatted string into a list of sentences. Args: source: The CoNLL formatted string. Returns: A list of Sentence objects parsed from the source. Raises: ParseError: If there is an error parsing the input. """ return list(self.iter_from_string(source))
[docs] def load_from_file(self, filepath: PathLike) -> list[S]: """ Parse a CoNLL file into a list of sentences. Assumes the file is UTF-8 encoded. Args: filepath: The path descriptor of the file to parse. Returns: A list of Sentence objects parsed from the file. Raises: IOError: If there is an error opening the given filename. ParseError: If there is an error parsing the input. """ return list(self.iter_from_file(filepath))
[docs] def load_from_resource(self, resource: io.TextIOBase) -> list[S]: """ Parse a CoNLL resource into a list of sentences. Args: resource: The resource from which to read in the strings from. The resource must have universal newline reading enabled. Returns: A list of Sentence objects parsed from the resource. Raises: ParseError: If there is an error parsing the input. """ return list(self.iter_from_resource(resource))
[docs] def iter_from_string(self, source: str) -> Iterator[S]: """ Iterate over the Sentences contained within the string. Args: source: The source string to extract the Sentence iterator from. Returns: The sentence iterator. Raises: ParseError: If there is an error parsing the input. """ yield from self.iter_from_resource(io.StringIO(source))
[docs] def iter_from_file(self, filepath: PathLike) -> Iterator[S]: """ Iterate over the Sentence contained within the file. Assumes that the file is UTF-8 encoded. Args: filepath: The path descriptor of the file to parse. Returns: The sentence iterator. Raises: IOError: If there is an error opening the given filename. ParseError: If there is an error parsing the input. """ with open(filepath, encoding="utf-8") as f: yield from self.iter_from_resource(f)
[docs] def iter_from_resource(self, resource: io.TextIOBase) -> Iterator[S]: """ Iterate over the Sentences contained within the resource. Args: resource: The resource from which to read in the strings from. The resource must have universal newline reading enabled. Returns: An iterator over the parsed Sentences within the resource. Raises: ParseError: If there is an error parsing the input. """ sentence: S = self.sentence_schema() empty = True token_line_seen = False sentence_seen = False def step_next_sentence(): nonlocal sentence, empty, token_line_seen, sentence_seen sentence.__finalize__() old = sentence sentence = self.sentence_schema() empty = True token_line_seen = False sentence_seen = True return old comment_len = len(self.comment_marker) i = 1 while line := resource.readline(): line_num = i i += 1 if line.isspace(): if not empty: yield step_next_sentence() continue empty = False if line[0] == self.comment_marker: if token_line_seen: raise ParseError( f"Comment on line number {line_num} is coming after a non-comment line " "has already been seen." ) equal_sep = line.find("=", 1) if equal_sep < 0: key = _pair_down_whitespace(line, comment_len) if key is not None: sentence.__accept_meta__(key, None) else: key = _pair_down_whitespace(line, comment_len, equal_sep) or "" value = _pair_down_whitespace(line, equal_sep + 1) or "" sentence.__accept_meta__(key, value) else: token_line_seen = True try: token = self.token_parser(line) sentence.__accept_token__(token) except ParseError as exc: raise ParseError( f"Error parsing token on line number {line_num} of the line source." ) from exc if not empty or not sentence_seen: yield step_next_sentence()
[docs] class WriteFormat[T]: """ A write-only interface for serializing CoNLL formatted data. This class wraps Serializer functionality and provides methods to serialize CoNLL data to various output formats including strings and IO resources. Use this when only serialization operations are needed. """
[docs] def __init__( self, token_schema: type[T], comment_marker: str = "#", delimiter: str = "\t", field_descriptors: Optional[dict[str, Optional[FieldDescriptor]]] = None, extra_primitives: Optional[set[type]] = None, ) -> None: """ Initialize the write format handler. Args: token_schema: The Token type to use for serialization. sentence_schema: The Sentence type to use for serialization. comment_marker: The prefix to use for comments or metadata. Defaults to '#'. delimiter: The delimiter between Token columns. Defaults to tab. field_descriptors: The descriptors for the fields on the schema as a mapping from the field name to the descriptor instance. For primitive types, use None as the descriptor. This takes precedence over anything on the type itself. extra_primitives: The set of types to consider as primitives (default construction and the str() operator are appropriate). This takes precedence over what is given on the tokenspec decorator. """ self.serializer = _compile.token_serializer( token_schema, delimiter, field_descriptors, extra_primitives ) self.comment_marker = comment_marker
[docs] def serialize_token(self, token: T) -> str: """ Serialize a token to a string representation. Args: token: The token to serialize. Returns: The serialized representation of the token. """ return self.serializer(token)
[docs] def serialize_sentence[S: AbstractSentence](self, sentence: S) -> str: """ Serialize a Sentence to a string representation. Args: sentence: The sentence to serialize. Returns: The serialized representation of the sentence. """ buffer = io.StringIO() self.write_sentence(sentence, buffer) return buffer.getvalue()
[docs] def write_sentence[S: AbstractSentence](self, sentence: S, writable: IO[str]) -> None: """ Write an individual sentence to an IO buffer. Note that the buffer always has a newline added at the end. Args: sentence: The sentence to write to the buffer. writable: The buffer to do the writing to. Raises: FormatError: If the serialization of a Token was unable to be performed. """ for meta in sentence.meta.items(): if meta[1] is not None: line = f"{self.comment_marker} {meta[0]} = {meta[1]}\n" else: line = f"{self.comment_marker} {meta[0]}\n" writable.write(line) for token in sentence.tokens: writable.write(self.serializer(token)) writable.write("\n")
[docs] def write_corpus[S: AbstractSentence](self, corpus: Iterable[S], writable: IO[str]) -> None: """ Write out the entire corpus to the IO buffer. Args: corpus: The sequence of sentences to write out. writable: The IO buffer to write the sentences to. Raises: FormatError: If the serialization of a Token was unable to be performed. """ for sentence in corpus: self.write_sentence(sentence, writable) writable.write("\n")
[docs] class Format[T, S: AbstractSentence](ReadFormat[T, S], WriteFormat[T]): """ A unified interface for both parsing and serializing CoNLL formatted data. This class combines the functionality of ReadFormat and WriteFormat through multiple inheritance, providing a complete read/write interface for CoNLL data. It maintains consistent formatting options (comment markers, delimiters) across both parsing and serialization operations. For typical use cases where both reading and writing are needed, this is the recommended class to use. """
[docs] def __init__( self, token_schema: type[T], sentence_schema: type[S], comment_marker: str = "#", delimiter: str = "\t", collapse_delimiters: bool = False, field_descriptors: Optional[dict[str, Optional[FieldDescriptor]]] = None, extra_primitives: Optional[set[type]] = None, ) -> None: """ Initialize the format handler with both read and write capabilities. Args: token_schema: The Token type to use for parsing and serialization. sentence_schema: The Sentence type to use for parsing and serialization. comment_marker: The character that marks the beginning of comments. Defaults to '#'. delimiter: The delimiter between the columns on a token line. Defaults to tab. collapse_delimiters: Flag if sequential delimiters denote an empty value or should be collapsed into one larger delimiter. Defaults to False. field_descriptors: The descriptors for the fields on the schema as a mapping from the field name to the descriptor instance. For primitive types, use None as the descriptor. This takes precedence over anything on the type itself. extra_primitives: The set of types to consider as primitives (default construction and the str() operator are appropriate). This takes precedence over what is given on the tokenspec decorator. """ ReadFormat.__init__( self, token_schema, sentence_schema, comment_marker, delimiter, collapse_delimiters, field_descriptors, extra_primitives, ) WriteFormat.__init__( self, token_schema, comment_marker, delimiter, field_descriptors, extra_primitives, )