util

Modules

inspect

Attributes

FieldType module-attribute
FieldType: TypeAlias = Union[Field, Attribute]

TypeAlias for dataclasses Fields or attrs Attributes. It resolves to the field type appropriate to the corresponding _DataclassesOrAttrClass.

Functions

attr_from
attr_from(cls: Type[_AttrFromType], kwargs: Dict[str, str], parsers: Optional[Dict[type, Callable[[str], Any]]] = None) -> _AttrFromType

Builds an attr or dataclasses class from keyword arguments

Parameters:

    cls (Type[_AttrFromType]): the attr or dataclasses class to be built. Required.
    kwargs (Dict[str, str]): a dictionary of keyword arguments. Required.
    parsers (Optional[Dict[type, Callable[[str], Any]]]): a dictionary of parser functions to apply to specific types. Default: None.
Source code in fgpyo/util/inspect.py
def attr_from(
    cls: Type[_AttrFromType],
    kwargs: Dict[str, str],
    parsers: Optional[Dict[type, Callable[[str], Any]]] = None,
) -> _AttrFromType:
    """Builds an attr or dataclasses class from key-word arguments

    Args:
        cls: the attr or dataclasses class to be built
        kwargs: a dictionary of keyword arguments
        parsers: a dictionary of parser functions to apply to specific types

    """
    return_values: Dict[str, Any] = {}
    for attribute in get_fields(cls):  # type: ignore[arg-type]
        return_value: Any
        if attribute.name in kwargs:
            str_value: str = kwargs[attribute.name]
            set_value: bool = False

            # Use the converter if provided
            converter = getattr(attribute, "converter", None)
            if converter is not None:
                return_value = converter(str_value)
                set_value = True

            # try getting a known parser
            if not set_value:
                try:
                    parser = _get_parser(cls=cls, type_=attribute.type, parsers=parsers)
                    return_value = parser(str_value)
                    set_value = True
                except ParserNotFoundException:
                    pass

            # try setting by casting
            # Note that while bools *can* be cast from string, all non-empty strings evaluate to
            # True, because python, so we need to check for that explicitly
            if not set_value and attribute.type is not None and attribute.type is not bool:
                try:
                    return_value = attribute.type(str_value)  # type: ignore[operator]
                    set_value = True
                except (ValueError, TypeError):
                    pass

            # fail otherwise
            assert set_value, (
                f"Do not know how to convert string to {attribute.type} for value: {str_value}"
            )
        else:  # no value, check for a default
            assert attribute.default is not None or _attribute_is_optional(attribute), (
                f"No value given and no default for attribute `{attribute.name}`"
            )
            return_value = attribute.default
            # when the default is attr.NOTHING, just use None
            if return_value in MISSING:
                return_value = None

        return_values[attribute.name] = return_value

    return cls(**return_values)
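
A hedged usage sketch (the `Sample` dataclass here is hypothetical): string values in `kwargs` are converted to the annotated field types, falling back to casting with the field's type when no parser is registered.

>>> import dataclasses
>>> from fgpyo.util.inspect import attr_from
>>> @dataclasses.dataclass
... class Sample:
...     name: str
...     read_count: int
>>> attr_from(cls=Sample, kwargs={"name": "s1", "read_count": "100"})
Sample(name='s1', read_count=100)
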
dict_parser
dict_parser(cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None) -> partial

Returns a function that parses a stringified dict into a Dict of the correct type.

Parameters:

    cls (Type): the type of the class object this is being parsed for (used to get the default value for parsers). Required.
    type_ (TypeAlias): the type of the attribute to be parsed. Required.
    parsers (Optional[Dict[type, Callable[[str], Any]]]): an optional mapping from type to the function to use for parsing that type (allows for parsing of more complex types). Default: None.
Source code in fgpyo/util/inspect.py
def dict_parser(
    cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None
) -> partial:
    """
    Returns a function that parses a stringified dict into a `Dict` of the correct type.

    Args:
        cls: the type of the class object this is being parsed for (used to get default val for
            parsers)
        type_: the type of the attribute to be parsed
        parsers: an optional mapping from type to the function to use for parsing that type
            (allows for parsing of more complex types)
    """
    subtypes = typing.get_args(type_)
    assert len(subtypes) == 2, "Dict object must have exactly 2 subtypes per PEP specification!"
    (key_parser, val_parser) = (
        _get_parser(
            cls,
            subtypes[0],
            parsers,
        ),
        _get_parser(
            cls,
            subtypes[1],
            parsers,
        ),
    )

    def dict_parse(dict_string: str) -> Dict[Any, Any]:
        """
        Parses a dictionary value (can do so recursively)
        """
        assert dict_string[0] == "{", "Dict val improperly formatted"
        assert dict_string[-1] == "}", "Dict val improperly formatted"
        dict_string = dict_string[1:-1]
        if len(dict_string) == 0:
            return {}
        else:
            outer_splits = split_at_given_level(dict_string, split_delim=",")
            out_dict = {}
            for outer_split in outer_splits:
                inner_splits = split_at_given_level(outer_split, split_delim=";")
                assert len(inner_splits) % 2 == 0, (
                    "Inner splits of dict didn't have matched key val pairs"
                )
                for i in range(0, len(inner_splits), 2):
                    key = key_parser(inner_splits[i])
                    if key in out_dict:
                        raise ValueError("Duplicate key found in dict: {}".format(key))
                    out_dict[key] = val_parser(inner_splits[i + 1])
            return out_dict

    return functools.partial(dict_parse)
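
The stringified format separates keys from values with ';' and key-value pairs with ','. A hedged sketch, assuming `_get_parser` falls back to the built-in `str` and `int` constructors when no custom `parsers` mapping is given:

>>> from typing import Dict
>>> from fgpyo.util.inspect import dict_parser
>>> parse = dict_parser(cls=dict, type_=Dict[str, int])
>>> parse("{a;1,b;2}")
{'a': 1, 'b': 2}
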
get_fields
get_fields(cls: Union[_DataclassesOrAttrClass, Type[_DataclassesOrAttrClass]]) -> Tuple[FieldType, ...]

Get the fields tuple from either a dataclasses or attr dataclass (or instance)

Source code in fgpyo/util/inspect.py
def get_fields(
    cls: Union[_DataclassesOrAttrClass, Type[_DataclassesOrAttrClass]],
) -> Tuple[FieldType, ...]:
    """Get the fields tuple from either a dataclasses or attr dataclass (or instance)"""
    if is_dataclasses_class(cls):
        return get_dataclasses_fields(cls)
    elif is_attr_class(cls):  # type: ignore[arg-type]
        return get_attr_fields(cls)  # type: ignore[arg-type, no-any-return]
    else:
        raise TypeError("cls must a dataclasses or attr class")
get_fields_dict
get_fields_dict(cls: Union[_DataclassesOrAttrClass, Type[_DataclassesOrAttrClass]]) -> Mapping[str, FieldType]

Get the fields dict from either a dataclasses or attr dataclass (or instance)

Source code in fgpyo/util/inspect.py
def get_fields_dict(
    cls: Union[_DataclassesOrAttrClass, Type[_DataclassesOrAttrClass]],
) -> Mapping[str, FieldType]:
    """Get the fields dict from either a dataclasses or attr dataclass (or instance)"""
    if is_dataclasses_class(cls):
        return _get_dataclasses_fields_dict(cls)
    elif is_attr_class(cls):  # type: ignore[arg-type]
        return get_attr_fields_dict(cls)  # type: ignore[arg-type]
    else:
        raise TypeError("cls must a dataclasses or attr class")
is_attr_class
is_attr_class(cls: type) -> bool

Return True if the class is an attr class, and False otherwise

Source code in fgpyo/util/inspect.py
def is_attr_class(cls: type) -> bool:
    """Return True if the class is an attr class, and False otherwise"""
    return hasattr(cls, "__attrs_attrs__")
list_parser
list_parser(cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None) -> partial

Returns a function that parses a "stringified" list into a List of the correct type.

Parameters:

    cls (Type): the type of the class object this is being parsed for (used to get the default value for parsers). Required.
    type_ (TypeAlias): the type of the attribute to be parsed. Required.
    parsers (Optional[Dict[type, Callable[[str], Any]]]): an optional mapping from type to the function to use for parsing that type (allows for parsing of more complex types). Default: None.
Source code in fgpyo/util/inspect.py
def list_parser(
    cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None
) -> partial:
    """
    Returns a function that parses a "stringified" list into a `List` of the correct type.

    Args:
        cls: the type of the class object this is being parsed for (used to get default val for
            parsers)
        type_: the type of the attribute to be parsed
        parsers: an optional mapping from type to the function to use for parsing that type (allows
            for parsing of more complex types)
    """
    subtypes = typing.get_args(type_)
    assert len(subtypes) == 1, "Lists are allowed only one subtype per PEP specification!"
    subtype_parser = _get_parser(
        cls,
        subtypes[0],
        parsers,
    )
    return functools.partial(
        lambda s: list(
            []
            if s == ""
            else [subtype_parser(item) for item in list(split_at_given_level(s, split_delim=","))]
        )
    )
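
Note that stringified lists are not flanked with brackets (matching Metric.format_value below). A hedged sketch, assuming built-in parsers resolve for `int`:

>>> from typing import List
>>> from fgpyo.util.inspect import list_parser
>>> parse = list_parser(cls=list, type_=List[int])
>>> parse("1,2,3")
[1, 2, 3]
>>> parse("")
[]
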
set_parser
set_parser(cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None) -> partial

Returns a function that parses a stringified set into a Set of the correct type.

Parameters:

    cls (Type): the type of the class object this is being parsed for (used to get the default value for parsers). Required.
    type_ (TypeAlias): the type of the attribute to be parsed. Required.
    parsers (Optional[Dict[type, Callable[[str], Any]]]): an optional mapping from type to the function to use for parsing that type (allows for parsing of more complex types). Default: None.
Source code in fgpyo/util/inspect.py
def set_parser(
    cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None
) -> partial:
    """
    Returns a function that parses a stringified set into a `Set` of the correct type.

    Args:
        cls: the type of the class object this is being parsed for (used to get default val for
            parsers)
        type_: the type of the attribute to be parsed
        parsers: an optional mapping from type to the function to use for parsing that type (allows
            for parsing of more complex types)
    """
    subtypes = typing.get_args(type_)
    assert len(subtypes) == 1, "Sets are allowed only one subtype per PEP specification!"
    subtype_parser = _get_parser(
        cls,
        subtypes[0],
        parsers,
    )
    return functools.partial(
        lambda s: set(
            set({})
            if s == "{}"
            else [
                subtype_parser(item) for item in set(split_at_given_level(s[1:-1], split_delim=","))
            ]
        )
    )
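
Stringified sets are flanked with '{}'. A hedged sketch, assuming built-in parsers resolve for `int` (sorted here because set iteration order is not deterministic):

>>> from typing import Set
>>> from fgpyo.util.inspect import set_parser
>>> parse = set_parser(cls=set, type_=Set[int])
>>> sorted(parse("{1,2,3}"))
[1, 2, 3]
>>> parse("{}")
set()
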
split_at_given_level
split_at_given_level(field: str, split_delim: str = ',', increase_depth_chars: Iterable[str] = ('{', '(', '['), decrease_depth_chars: Iterable[str] = ('}', ')', ']')) -> List[str]

Splits a nested field by its outer-most level

Note that this method may produce incorrect results for fields containing strings with unpaired characters that increase or decrease the depth

Not currently smart enough to deal with fields enclosed in quotes ('' or "") - TODO

Source code in fgpyo/util/inspect.py
def split_at_given_level(
    field: str,
    split_delim: str = ",",
    increase_depth_chars: Iterable[str] = ("{", "(", "["),
    decrease_depth_chars: Iterable[str] = ("}", ")", "]"),
) -> List[str]:
    """
    Splits a nested field by its outer-most level

    Note that this method may produce incorrect results for fields containing strings with
    unpaired characters that increase or decrease the depth

    Not currently smart enough to deal with fields enclosed in quotes ('' or "") - TODO
    """

    outer_depth_of_split = 0
    current_outer_splits = []
    out_vals: List[str] = []
    for high_level_split in field.split(split_delim):
        increase_in_depth = 0
        for char in increase_depth_chars:
            increase_in_depth += high_level_split.count(char)

        decrease_in_depth = 0
        for char in decrease_depth_chars:
            decrease_in_depth += high_level_split.count(char)
        outer_depth_of_split += increase_in_depth - decrease_in_depth

        assert outer_depth_of_split >= 0, "Unpaired depth character! Likely incorrect output"

        current_outer_splits.append(high_level_split)
        if outer_depth_of_split == 0:
            out_vals.append(split_delim.join(current_outer_splits))
            current_outer_splits = []
    assert outer_depth_of_split == 0, "Unpaired depth character! Likely incorrect output!"
    return out_vals
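
For example, delimiters nested inside brackets are not split on:

>>> from fgpyo.util.inspect import split_at_given_level
>>> split_at_given_level("a,(b,c),d")
['a', '(b,c)', 'd']
>>> split_at_given_level("{1;2,3;4},{5;6}")
['{1;2,3;4}', '{5;6}']
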
tuple_parser
tuple_parser(cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None) -> partial

Returns a function that parses a stringified tuple into a Tuple of the correct type.

Parameters:

    cls (Type): the type of the class object this is being parsed for (used to get the default value for parsers). Required.
    type_ (TypeAlias): the type of the attribute to be parsed. Required.
    parsers (Optional[Dict[type, Callable[[str], Any]]]): an optional mapping from type to the function to use for parsing that type (allows for parsing of more complex types). Default: None.
Source code in fgpyo/util/inspect.py
def tuple_parser(
    cls: Type, type_: TypeAlias, parsers: Optional[Dict[type, Callable[[str], Any]]] = None
) -> partial:
    """
    Returns a function that parses a stringified tuple into a `Tuple` of the correct type.

    Args:
        cls: the type of the class object this is being parsed for (used to get default val for
            parsers)
        type_: the type of the attribute to be parsed
        parsers: an optional mapping from type to the function to use for parsing that type (allows
            for parsing of more complex types)
    """
    subtype_parsers = [
        _get_parser(
            cls,
            subtype,
            parsers,
        )
        for subtype in typing.get_args(type_)
    ]

    def tuple_parse(tuple_string: str) -> Tuple[Any, ...]:
        """
        Parses a dictionary value (can do so recursively)
        Note that this tool will fail on tuples containing strings containing
        unpaired '{', or '}' characters
        """
        assert tuple_string[0] == "(", "Tuple val improperly formatted"
        assert tuple_string[-1] == ")", "Tuple val improperly formatted"
        tuple_string = tuple_string[1:-1]
        if len(tuple_string) == 0:
            return ()
        else:
            val_strings = split_at_given_level(tuple_string, split_delim=",")
            return tuple(parser(val_str) for parser, val_str in zip(subtype_parsers, val_strings))

    return functools.partial(tuple_parse)
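
Stringified tuples are flanked with '()' and each position is parsed with the parser for the corresponding subtype. A hedged sketch, assuming built-in parsers resolve for `int` and `str`:

>>> from typing import Tuple
>>> from fgpyo.util.inspect import tuple_parser
>>> parse = tuple_parser(cls=tuple, type_=Tuple[int, str])
>>> parse("(1,a)")
(1, 'a')
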

Modules

logging

Methods for setting up logging for tools.

Progress Logging Examples

Input data (SAM/BAM/CRAM/VCF) are frequently iterated in genomic coordinate order. Progress logging is useful for reporting not only how many records have been consumed but also their genomic coordinates. ProgressLogger() can log progress every fixed number of records. Messages can be written to a logging.Logger as well as to a custom print method.

>>> from fgpyo.util.logging import ProgressLogger
>>> logged_lines = []
>>> progress = ProgressLogger(
...     printer=lambda s: logged_lines.append(s),
...     verb="recorded",
...     noun="items",
...     unit=2
... )
>>> progress.record(reference_name="chr1", position=1)  # does not log
False
>>> progress.record(reference_name="chr1", position=2)  # logs
True
>>> progress.record(reference_name="chr1", position=3)  # does not log
False
>>> progress.log_last()  # will log the last recorded item, if not previously logged
True
>>> logged_lines  # show the lines logged
['recorded 2 items: chr1:2', 'recorded 3 items: chr1:3']

Classes

ProgressLogger

Bases: AbstractContextManager

A little class to track progress.

This will output a log message each time unit records have been recorded.

Attributes:

    printer (Callable[[str], Any]): either a Logger (in which case progress will be printed at Info) or a lambda that consumes a single string
    noun (str): the noun to use in the log message
    verb (str): the verb to use in the log message
    unit (int): the number of items for every log message
    count (int): the total count of items recorded

Source code in fgpyo/util/logging.py
class ProgressLogger(AbstractContextManager):
    """A little class to track progress.

    This will output a log message each time `unit` records have been recorded.

    Attributes:
        printer: either a Logger (in which case progress will be printed at Info) or a lambda
            that consumes a single string
        noun: the noun to use in the log message
        verb: the verb to use in the log message
        unit: the number of items for every log message
        count: the total count of items recorded
    """

    def __init__(
        self,
        printer: Union[Logger, Callable[[str], Any]],
        noun: str = "records",
        verb: str = "Read",
        unit: int = 100000,
    ) -> None:
        self.printer: Callable[[str], Any]
        if isinstance(printer, Logger):
            self.printer = lambda s: printer.info(s)
        else:
            self.printer = printer
        self.noun: str = noun
        self.verb: str = verb
        self.unit: int = unit
        self.count: int = 0
        self._count_mod_unit: int = 0
        self._last_reference_name: Optional[str] = None
        self._last_position: Optional[int] = None

    def __exit__(
        self, ex_type: Optional[Any], ex_value: Optional[Any], traceback: Optional[Any]
    ) -> Literal[False]:
        if ex_value is None:
            self.log_last()
        return False

    def record(
        self,
        reference_name: Optional[str] = None,
        position: Optional[int] = None,
    ) -> bool:
        """Record an item at a given genomic coordinate.
        Args:
            reference_name: the reference name of the item
            position: the 1-based start position of the item
        Returns:
            true if a message was logged, false otherwise
        """
        self.count += 1
        self._count_mod_unit += 1
        self._last_reference_name = reference_name
        self._last_position = None if position is None or position <= 0 else position
        if self._count_mod_unit == self.unit:
            self._count_mod_unit = 0
            self._log(refname=self._last_reference_name, position=self._last_position)
            return True
        else:
            return False

    def record_alignment(
        self,
        rec: AlignedSegment,
    ) -> bool:
        """Correctly record pysam.AlignedSegments (zero-based coordinates).

        Args:
            rec: pysam.AlignedSegment object

        Returns:
            true if a message was logged, false otherwise
        """
        if rec.reference_start is None:
            return self.record(None, None)
        else:
            return self.record(rec.reference_name, rec.reference_start + 1)

    def record_alignments(
        self,
        recs: Iterable[AlignedSegment],
    ) -> bool:
        """Correctly record multiple pysam.AlignedSegments (zero-based coordinates).

        Args:
            recs: pysam.AlignedSegment objects

        Returns:
            true if a message was logged, false otherwise
        """
        logged_message: bool = False
        for rec in recs:
            logged_message = self.record_alignment(rec) or logged_message
        return logged_message

    def _log(
        self,
        refname: Optional[str] = None,
        position: Optional[int] = None,
    ) -> None:
        """Helper method to print the log message.

        Args:
            refname: the name of the reference of the item
            position: the 1-based start position of the item

        Returns:
            None
        """
        coordinate: str
        if refname is None and position is None:
            coordinate = "NA"
        else:
            assert refname is not None and position is not None, f"{refname} {position}"
            coordinate = f"{refname}:{position:,d}"

        self.printer(f"{self.verb} {self.count:,d} {self.noun}: {coordinate}")

        return None

    def log_last(
        self,
    ) -> bool:
        """Force logging the last record, for example when progress has completed."""
        if self._count_mod_unit != 0:
            self._log(refname=self._last_reference_name, position=self._last_position)
            return True
        else:
            return False
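
Because ProgressLogger is a context manager, log_last() is invoked automatically on a clean exit so the final partial count is not lost. A hedged sketch with a standard Logger as the printer (the noun, verb, and unit values are illustrative):

import logging
from fgpyo.util.logging import ProgressLogger

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("fgpyo")

with ProgressLogger(printer=logger, noun="variants", verb="Processed", unit=10000) as progress:
    for reference_name, position in [("chr1", 100), ("chr1", 200)]:  # stand-in for a real iterator
        progress.record(reference_name=reference_name, position=position)
# on exit, the last recorded item is logged even though fewer than `unit` records were seen
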
Functions
log_last
log_last() -> bool

Force logging the last record, for example when progress has completed.

Source code in fgpyo/util/logging.py
def log_last(
    self,
) -> bool:
    """Force logging the last record, for example when progress has completed."""
    if self._count_mod_unit != 0:
        self._log(refname=self._last_reference_name, position=self._last_position)
        return True
    else:
        return False
record
record(reference_name: Optional[str] = None, position: Optional[int] = None) -> bool

Record an item at a given genomic coordinate.

Parameters:

    reference_name (Optional[str]): the reference name of the item. Default: None.
    position (Optional[int]): the 1-based start position of the item. Default: None.

Returns:

    bool: true if a message was logged, false otherwise

Source code in fgpyo/util/logging.py
def record(
    self,
    reference_name: Optional[str] = None,
    position: Optional[int] = None,
) -> bool:
    """Record an item at a given genomic coordinate.
    Args:
        reference_name: the reference name of the item
        position: the 1-based start position of the item
    Returns:
        true if a message was logged, false otherwise
    """
    self.count += 1
    self._count_mod_unit += 1
    self._last_reference_name = reference_name
    self._last_position = None if position is None or position <= 0 else position
    if self._count_mod_unit == self.unit:
        self._count_mod_unit = 0
        self._log(refname=self._last_reference_name, position=self._last_position)
        return True
    else:
        return False
record_alignment
record_alignment(rec: AlignedSegment) -> bool

Correctly record pysam.AlignedSegments (zero-based coordinates).

Parameters:

    rec (AlignedSegment): pysam.AlignedSegment object. Required.

Returns:

    bool: true if a message was logged, false otherwise

Source code in fgpyo/util/logging.py
def record_alignment(
    self,
    rec: AlignedSegment,
) -> bool:
    """Correctly record pysam.AlignedSegments (zero-based coordinates).

    Args:
        rec: pysam.AlignedSegment object

    Returns:
        true if a message was logged, false otherwise
    """
    if rec.reference_start is None:
        return self.record(None, None)
    else:
        return self.record(rec.reference_name, rec.reference_start + 1)
record_alignments
record_alignments(recs: Iterable[AlignedSegment]) -> bool

Correctly record multiple pysam.AlignedSegments (zero-based coordinates).

Parameters:

    recs (Iterable[AlignedSegment]): pysam.AlignedSegment objects. Required.

Returns:

    bool: true if a message was logged, false otherwise

Source code in fgpyo/util/logging.py
def record_alignments(
    self,
    recs: Iterable[AlignedSegment],
) -> bool:
    """Correctly record multiple pysam.AlignedSegments (zero-based coordinates).

    Args:
        recs: pysam.AlignedSegment objects

    Returns:
        true if a message was logged, false otherwise
    """
    logged_message: bool = False
    for rec in recs:
        logged_message = self.record_alignment(rec) or logged_message
    return logged_message

Functions

setup_logging
setup_logging(level: str = 'INFO', name: str = 'fgpyo') -> None

Globally configure logging for all modules

Configures logging to run at a specific level and output messages to stderr with useful information preceding the actual log message.

Parameters:

    level (str): the default level for the logger. Default: 'INFO'.
    name (str): the name of the logger. Default: 'fgpyo'.
Source code in fgpyo/util/logging.py
def setup_logging(level: str = "INFO", name: str = "fgpyo") -> None:
    """Globally configure logging for all modules

    Configures logging to run at a specific level and output messages to stderr with
    useful information preceding the actual log message.

    Args:
        level: the default level for the logger
        name: the name of the logger
    """
    global __FGPYO_LOGGING_SETUP

    with __LOCK:
        if not __FGPYO_LOGGING_SETUP:
            format = (
                f"%(asctime)s {socket.gethostname()} %(name)s:%(funcName)s:%(lineno)s "
                + "[%(levelname)s]: %(message)s"
            )
            handler = logging.StreamHandler()
            handler.setLevel(level)
            handler.setFormatter(logging.Formatter(format))

            logger = logging.getLogger(name)
            logger.setLevel(level)
            logger.addHandler(handler)
        else:
            logging.getLogger(__name__).warning("Logging already initialized.")

        __FGPYO_LOGGING_SETUP = True

metric

Metrics

Module for storing, reading, and writing metric-like tab-delimited information.

Metric files are tab-delimited, contain a header, and zero or more rows for metric values. This makes it easy for them to be read in languages like R. For example, a row per person, with columns for age, gender, and address.

The Metric() class makes it easy to read, write, and store one or more metrics of the same type, all the while preserving types for each value in a metric. It is an abstract base class decorated by @dataclass, or @attr.s, with attributes storing one or more typed values. If using multiple layers of inheritance, keep in mind that it's not possible to mix these dataclass utils, e.g. a dataclasses class derived from an attr class will not appropriately initialize the values of the attr superclass.

Examples

Defining a new metric class:

>>> from fgpyo.util.metric import Metric
>>> import dataclasses
>>> @dataclasses.dataclass(frozen=True)
... class Person(Metric["Person"]):
...     name: str
...     age: int

or using attr:

>>> from fgpyo.util.metric import Metric
>>> import attr
>>> from typing import Optional
>>> @attr.s(auto_attribs=True, frozen=True)
... class PersonAttr(Metric["PersonAttr"]):
...     name: str
...     age: int
...     address: Optional[str] = None

Getting the attributes for a metric class. These will be used for the header when reading and writing metric files.

>>> Person.header()
['name', 'age']

Getting the values from a metric class instance. The values are in the same order as the header.

>>> list(Person(name="Alice", age=47).values())
['Alice', 47]

Writing a list of metrics to a file:

>>> metrics = [
...     Person(name="Alice", age=47),
...     Person(name="Bob", age=24)
... ]
>>> from pathlib import Path
>>> Person.write(Path("/path/to/metrics.txt"), *metrics)  

Then the contents of the written metrics file:

$ column -t /path/to/metrics.txt
name   age
Alice  47
Bob    24

Reading the metrics file back in:

>>> list(Person.read(Path("/path/to/metrics.txt")))  
[Person(name='Alice', age=47), Person(name='Bob', age=24)]

Formatting and parsing the values for custom types is supported by overriding the _parsers() and format_value() methods.

>>> @dataclasses.dataclass(frozen=True)
... class Name:
...     first: str
...     last: str
...     @classmethod
...     def parse(cls, value: str) -> "Name":
...          fields = value.split(" ")
...          return Name(first=fields[0], last=fields[1])
>>> from typing import Dict, Callable, Any
>>> @dataclasses.dataclass(frozen=True)
... class PersonWithName(Metric["PersonWithName"]):
...     name: Name
...     age: int
...     @classmethod
...     def _parsers(cls) -> Dict[type, Callable[[str], Any]]:
...         return {Name: lambda value: Name.parse(value=value)}
...     @classmethod
...     def format_value(cls, value: Any) -> str:
...         if isinstance(value, Name):
...             return f"{value.first} {value.last}"
...         else:
...             return super().format_value(value=value)
>>> PersonWithName.parse(fields=["john doe", "42"])
PersonWithName(name=Name(first='john', last='doe'), age=42)
>>> PersonWithName(name=Name(first='john', last='doe'), age=42).formatted_values()
['john doe', '42']

Classes

Metric

Bases: ABC, Generic[MetricType]

Abstract base class for all metric-like tab-delimited files

Metric files are tab-delimited, contain a header, and zero or more rows for metric values. This makes it easy for them to be read in languages like R.

Subclasses of Metric() can support parsing and formatting custom types with _parsers() and format_value().

Source code in fgpyo/util/metric.py
class Metric(ABC, Generic[MetricType]):
    """Abstract base class for all metric-like tab-delimited files

    Metric files are tab-delimited, contain a header, and zero or more rows for metric values.  This
    makes it easy for them to be read in languages like `R`.

    Subclasses of [`Metric()`][fgpyo.util.metric.Metric] can support parsing and
    formatting custom types with `_parsers()` and
    [`format_value()`][fgpyo.util.metric.Metric.format_value].
    """

    @classmethod
    def keys(cls) -> Iterator[str]:
        """An iterator over field names in the same order as the header."""
        for field in inspect.get_fields(cls):  # type: ignore[arg-type]
            yield field.name

    def values(self) -> Iterator[Any]:
        """An iterator over attribute values in the same order as the header."""
        for field in inspect.get_fields(self.__class__):  # type: ignore[arg-type]
            yield getattr(self, field.name)

    def items(self) -> Iterator[Tuple[str, Any]]:
        """
        An iterator over field names and their corresponding values in the same order as the header.
        """
        for field in inspect.get_fields(self.__class__):  # type: ignore[arg-type]
            yield (field.name, getattr(self, field.name))

    def formatted_values(self) -> List[str]:
        """An iterator over formatted attribute values in the same order as the header."""
        return [self.format_value(value) for value in self.values()]

    def formatted_items(self) -> List[Tuple[str, str]]:
        """An iterator over formatted attribute values in the same order as the header."""
        return [(key, self.format_value(value)) for key, value in self.items()]

    @classmethod
    def _parsers(cls) -> Dict[type, Callable[[str], Any]]:
        """Mapping of type to a specific parser for that type.  The parser must accept a string
        as a single parameter and return a single value of the given type.  Sub-classes may
        override this method to support custom types."""
        return {}

    @classmethod
    def read(
        cls,
        path: Path,
        ignore_extra_fields: bool = True,
        strip_whitespace: bool = False,
        threads: Optional[int] = None,
    ) -> Iterator[Any]:
        """Reads in zero or more metrics from the given path.

        The metric file must contain a matching header.

        Columns that are not present in the file but are optional in the metric class will
        be set to their default values.

        Args:
            path: the path to the metrics file.
            ignore_extra_fields: True to ignore any extra columns, False to raise an exception.
            strip_whitespace: True to strip leading and trailing whitespace from each field,
                               False to keep as-is.
            threads: the number of threads to use when decompressing gzip files
        """
        parsers = cls._parsers()
        with io.to_reader(path, threads=threads) as reader:
            header: List[str] = reader.readline().rstrip("\r\n").split("\t")
            # check the header
            class_fields = set(cls.header())
            file_fields = set(header)
            missing_from_class = file_fields.difference(class_fields)
            missing_from_file = class_fields.difference(file_fields)

            field_name_to_attribute = inspect.get_fields_dict(cls)  # type: ignore[arg-type]

            # ignore class fields that are missing from the file (via header) if they're optional
            # or have a default
            if len(missing_from_file) > 0:
                fields_with_defaults = [
                    field
                    for field in missing_from_file
                    if inspect._attribute_has_default(field_name_to_attribute[field])
                ]
                # remove optional class fields from the fields
                missing_from_file = missing_from_file.difference(fields_with_defaults)

            # raise an exception if there are non-optional class fields missing from the file
            if len(missing_from_file) > 0:
                raise ValueError(
                    f"In file: {path}, fields in file missing from class '{cls.__name__}': "
                    + ", ".join(missing_from_file)
                )

            # raise an exception if there are fields in the file not in the class, unless they
            # should be ignored.
            if not ignore_extra_fields and len(missing_from_class) > 0:
                raise ValueError(
                    f"In file: {path}, extra fields in file not in class '{cls.__name__}': "
                    + ", ".join(missing_from_class)
                )

            # read the metric lines
            for lineno, line in enumerate(reader, 2):
                # parse the raw values
                values: List[str] = line.rstrip("\r\n").split("\t")
                if strip_whitespace:
                    values = [v.strip() for v in values]

                # raise an exception if there aren't the same number of values as the header
                if len(header) != len(values):
                    raise ValueError(
                        f"In file: {path}, expected {len(header)} columns, got {len(values)} on "
                        f"line {lineno}: {line}"
                    )

                # build the metric
                instance: Metric[MetricType] = inspect.attr_from(
                    cls=cls, kwargs=dict(zip(header, values)), parsers=parsers
                )
                yield instance

    @classmethod
    def parse(cls, fields: List[str]) -> Any:
        """Parses the string-representation of this metric.  One string per attribute should be
        given.

        """
        parsers = cls._parsers()
        header = cls.header()
        assert len(fields) == len(header)
        return inspect.attr_from(cls=cls, kwargs=dict(zip(header, fields)), parsers=parsers)

    @classmethod
    def write(cls, path: Path, *values: MetricType, threads: Optional[int] = None) -> None:
        """Writes zero or more metrics to the given path.

        The header will always be written.

        Args:
            path: Path to the output file.
            values: Zero or more metrics.
            threads: the number of threads to use when compressing gzip files

        """
        with MetricWriter[MetricType](path, metric_class=cls, threads=threads) as writer:
            writer.writeall(values)

    @classmethod
    def header(cls) -> List[str]:
        """The list of header values for the metric."""
        return [a.name for a in inspect.get_fields(cls)]  # type: ignore[arg-type]

    @classmethod
    def format_value(cls, value: Any) -> str:  # noqa: C901
        """The default method to format values of a given type.

        By default, this method will comma-delimit `list`, `tuple`, and `set` types, and apply
        `str` to all others.

        Dictionaries / mappings will have keys and vals separated by semicolons, and key val pairs
        delimited by commas.

        In addition, tuples will be flanked with '()' and sets and dictionaries with '{}';
        lists are written without flanking brackets.

        Args:
            value: the value to format.
        """
        if issubclass(type(value), Enum):
            return cls.format_value(value.value)
        if isinstance(value, (tuple)):
            if len(value) == 0:
                return "()"
            else:
                return "(" + ",".join(cls.format_value(v) for v in value) + ")"
        if isinstance(value, (list)):
            if len(value) == 0:
                return ""
            else:
                return ",".join(cls.format_value(v) for v in value)
        if isinstance(value, (set)):
            if len(value) == 0:
                return ""
            else:
                return "{" + ",".join(cls.format_value(v) for v in value) + "}"

        elif isinstance(value, dict):
            if len(value) == 0:
                return "{}"
            else:
                return (
                    "{"
                    + ",".join(
                        f"{cls.format_value(k)};{cls.format_value(v)}" for k, v in value.items()
                    )
                    + "}"
                )
        elif isinstance(value, float):
            return f"{round(value, 5)}"
        elif value is None:
            return ""
        else:
            return f"{value}"

    @classmethod
    def to_list(cls, value: str) -> List[Any]:
        """Returns a list value split on comma delimeter."""
        return [] if value == "" else value.split(",")

    @staticmethod
    def fast_concat(*inputs: Path, output: Path) -> None:
        # Concatenate metric files that share an identical header: the shared header is
        # written once, then the data rows from each input are appended in order.
        if len(inputs) == 0:
            raise ValueError("No inputs provided")

        headers = [next(io.read_lines(input_path)) for input_path in inputs]
        assert len(set(headers)) == 1, "Input headers do not match"
        io.write_lines(path=output, lines_to_write=set(headers))

        for input_path in inputs:
            io.write_lines(
                path=output, lines_to_write=list(io.read_lines(input_path))[1:], append=True
            )

    @staticmethod
    def _read_header(
        reader: TextIOWrapper,
        delimiter: str = "\t",
        comment_prefix: str = "#",
    ) -> MetricFileHeader:
        """
        Read the header from an open file.

        The first row after any commented or empty lines will be used as the fieldnames.

        Lines preceding the fieldnames will be returned in the `preamble`. Leading and trailing
        whitespace are removed and ignored.

        Args:
            reader: An open, readable file handle.
            delimiter: The delimiter character used to separate fields in the file.
            comment_prefix: The prefix for comment lines in the file.

        Returns:
            A `MetricFileHeader` containing the field names and any preceding lines.

        Raises:
            ValueError: If the file was empty or contained only comments or empty lines.
        """

        preamble: List[str] = []

        for line in reader:
            if line.strip().startswith(comment_prefix) or line.strip() == "":
                # Skip any commented or empty lines before the header
                preamble.append(line.strip())
            else:
                # The first line with any other content is assumed to be the header
                fieldnames = line.strip().split(delimiter)
                break
        else:
            # If the file was empty, kick back an empty header
            fieldnames = []

        return MetricFileHeader(preamble=preamble, fieldnames=fieldnames)
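
fast_concat concatenates metric files that were already written to disk, without re-parsing rows: the shared header is written once and data rows are appended in input order (the headers are asserted to match). A hedged sketch reusing the `Person` metric from the examples above; note that the inputs are positional and `output` is keyword-only:

from pathlib import Path

Person.fast_concat(Path("batch1.txt"), Path("batch2.txt"), output=Path("all_people.txt"))
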
Functions
format_value classmethod
format_value(value: Any) -> str

The default method to format values of a given type.

By default, this method will comma-delimit list, tuple, and set types, and apply str to all others.

Dictionaries / mappings will have keys and vals separated by semicolons, and key val pairs delimited by commas.

In addition, tuples will be flanked with '()' and sets and dictionaries with '{}'; lists are written without flanking brackets.

Parameters:

    value (Any): the value to format. Required.
Source code in fgpyo/util/metric.py
@classmethod
def format_value(cls, value: Any) -> str:  # noqa: C901
    """The default method to format values of a given type.

    By default, this method will comma-delimit `list`, `tuple`, and `set` types, and apply
    `str` to all others.

    Dictionaries / mappings will have keys and vals separated by semicolons, and key val pairs
    delimited by commas.

    In addition, tuples will be flanked with '()' and sets and dictionaries with '{}';
    lists are written without flanking brackets.

    Args:
        value: the value to format.
    """
    if issubclass(type(value), Enum):
        return cls.format_value(value.value)
    if isinstance(value, (tuple)):
        if len(value) == 0:
            return "()"
        else:
            return "(" + ",".join(cls.format_value(v) for v in value) + ")"
    if isinstance(value, (list)):
        if len(value) == 0:
            return ""
        else:
            return ",".join(cls.format_value(v) for v in value)
    if isinstance(value, (set)):
        if len(value) == 0:
            return ""
        else:
            return "{" + ",".join(cls.format_value(v) for v in value) + "}"

    elif isinstance(value, dict):
        if len(value) == 0:
            return "{}"
        else:
            return (
                "{"
                + ",".join(
                    f"{cls.format_value(k)};{cls.format_value(v)}" for k, v in value.items()
                )
                + "}"
            )
    elif isinstance(value, float):
        return f"{round(value, 5)}"
    elif value is None:
        return ""
    else:
        return f"{value}"
formatted_items
formatted_items() -> List[Tuple[str, str]]

An iterator over formatted attribute values in the same order as the header.

Source code in fgpyo/util/metric.py
def formatted_items(self) -> List[Tuple[str, str]]:
    """An iterator over formatted attribute values in the same order as the header."""
    return [(key, self.format_value(value)) for key, value in self.items()]
formatted_values
formatted_values() -> List[str]

An iterator over formatted attribute values in the same order as the header.

Source code in fgpyo/util/metric.py
def formatted_values(self) -> List[str]:
    """An iterator over formatted attribute values in the same order as the header."""
    return [self.format_value(value) for value in self.values()]
header classmethod
header() -> List[str]

The list of header values for the metric.

Source code in fgpyo/util/metric.py
@classmethod
def header(cls) -> List[str]:
    """The list of header values for the metric."""
    return [a.name for a in inspect.get_fields(cls)]  # type: ignore[arg-type]
items
items() -> Iterator[Tuple[str, Any]]

An iterator over field names and their corresponding values in the same order as the header.

Source code in fgpyo/util/metric.py
def items(self) -> Iterator[Tuple[str, Any]]:
    """
    An iterator over field names and their corresponding values in the same order as the header.
    """
    for field in inspect.get_fields(self.__class__):  # type: ignore[arg-type]
        yield (field.name, getattr(self, field.name))
keys classmethod
keys() -> Iterator[str]

An iterator over field names in the same order as the header.

Source code in fgpyo/util/metric.py
@classmethod
def keys(cls) -> Iterator[str]:
    """An iterator over field names in the same order as the header."""
    for field in inspect.get_fields(cls):  # type: ignore[arg-type]
        yield field.name
parse classmethod
parse(fields: List[str]) -> Any

Parses the string-representation of this metric. One string per attribute should be given.

Source code in fgpyo/util/metric.py
@classmethod
def parse(cls, fields: List[str]) -> Any:
    """Parses the string-representation of this metric.  One string per attribute should be
    given.

    """
    parsers = cls._parsers()
    header = cls.header()
    assert len(fields) == len(header)
    return inspect.attr_from(cls=cls, kwargs=dict(zip(header, fields)), parsers=parsers)
read classmethod
read(path: Path, ignore_extra_fields: bool = True, strip_whitespace: bool = False, threads: Optional[int] = None) -> Iterator[Any]

Reads in zero or more metrics from the given path.

The metric file must contain a matching header.

Columns that are not present in the file but are optional in the metric class will be set to their default values.

Parameters:

    path (Path): the path to the metrics file. Required.
    ignore_extra_fields (bool): True to ignore any extra columns, False to raise an exception. Default: True.
    strip_whitespace (bool): True to strip leading and trailing whitespace from each field, False to keep as-is. Default: False.
    threads (Optional[int]): the number of threads to use when decompressing gzip files. Default: None.
Source code in fgpyo/util/metric.py
@classmethod
def read(
    cls,
    path: Path,
    ignore_extra_fields: bool = True,
    strip_whitespace: bool = False,
    threads: Optional[int] = None,
) -> Iterator[Any]:
    """Reads in zero or more metrics from the given path.

    The metric file must contain a matching header.

    Columns that are not present in the file but are optional in the metric class will
    be set to their default values.

    Args:
        path: the path to the metrics file.
        ignore_extra_fields: True to ignore any extra columns, False to raise an exception.
        strip_whitespace: True to strip leading and trailing whitespace from each field,
                           False to keep as-is.
        threads: the number of threads to use when decompressing gzip files
    """
    parsers = cls._parsers()
    with io.to_reader(path, threads=threads) as reader:
        header: List[str] = reader.readline().rstrip("\r\n").split("\t")
        # check the header
        class_fields = set(cls.header())
        file_fields = set(header)
        missing_from_class = file_fields.difference(class_fields)
        missing_from_file = class_fields.difference(file_fields)

        field_name_to_attribute = inspect.get_fields_dict(cls)  # type: ignore[arg-type]

        # ignore class fields that are missing from the file (via header) if they're optional
        # or have a default
        if len(missing_from_file) > 0:
            fields_with_defaults = [
                field
                for field in missing_from_file
                if inspect._attribute_has_default(field_name_to_attribute[field])
            ]
            # remove optional class fields from the fields
            missing_from_file = missing_from_file.difference(fields_with_defaults)

        # raise an exception if there are non-optional class fields missing from the file
        if len(missing_from_file) > 0:
            raise ValueError(
                f"In file: {path}, fields in file missing from class '{cls.__name__}': "
                + ", ".join(missing_from_file)
            )

        # raise an exception if there are fields in the file not in the class, unless they
        # should be ignored.
        if not ignore_extra_fields and len(missing_from_class) > 0:
            raise ValueError(
                f"In file: {path}, extra fields in file not in class '{cls.__name__}': "
                + ", ".join(missing_from_class)
            )

        # read the metric lines
        for lineno, line in enumerate(reader, 2):
            # parse the raw values
            values: List[str] = line.rstrip("\r\n").split("\t")
            if strip_whitespace:
                values = [v.strip() for v in values]

            # raise an exception if there aren't the same number of values as the header
            if len(header) != len(values):
                raise ValueError(
                    f"In file: {path}, expected {len(header)} columns, got {len(values)} on "
                    f"line {lineno}: {line}"
                )

            # build the metric
            instance: Metric[MetricType] = inspect.attr_from(
                cls=cls, kwargs=dict(zip(header, values)), parsers=parsers
            )
            yield instance
to_list classmethod
to_list(value: str) -> List[Any]

Returns a list value split on comma delimiter.

Source code in fgpyo/util/metric.py
@classmethod
def to_list(cls, value: str) -> List[Any]:
    """Returns a list value split on comma delimeter."""
    return [] if value == "" else value.split(",")
values
values() -> Iterator[Any]

An iterator over attribute values in the same order as the header.

Source code in fgpyo/util/metric.py
def values(self) -> Iterator[Any]:
    """An iterator over attribute values in the same order as the header."""
    for field in inspect.get_fields(self.__class__):  # type: ignore[arg-type]
        yield getattr(self, field.name)
write classmethod
write(path: Path, *values: MetricType, threads: Optional[int] = None) -> None

Writes zero or more metrics to the given path.

The header will always be written.

Parameters:

    path (Path): Path to the output file. Required.
    values (MetricType): Zero or more metrics. Default: ().
    threads (Optional[int]): the number of threads to use when compressing gzip files. Default: None.
Source code in fgpyo/util/metric.py
@classmethod
def write(cls, path: Path, *values: MetricType, threads: Optional[int] = None) -> None:
    """Writes zero or more metrics to the given path.

    The header will always be written.

    Args:
        path: Path to the output file.
        values: Zero or more metrics.
        threads: the number of threads to use when compressing gzip files

    """
    with MetricWriter[MetricType](path, metric_class=cls, threads=threads) as writer:
        writer.writeall(values)
MetricFileHeader dataclass

Header of a file.

A file's header contains an optional preamble, consisting of lines prefixed by a comment character and/or empty lines, and a required row of fieldnames before the data rows begin.

Attributes:

    preamble (List[str]): A list of any lines preceding the fieldnames.
    fieldnames (List[str]): The field names specified in the final line of the header.

Source code in fgpyo/util/metric.py
@dataclass(frozen=True)
class MetricFileHeader:
    """
    Header of a file.

    A file's header contains an optional preamble, consisting of lines prefixed by a comment
    character and/or empty lines, and a required row of fieldnames before the data rows begin.

    Attributes:
        preamble: A list of any lines preceding the fieldnames.
        fieldnames: The field names specified in the final line of the header.
    """

    preamble: List[str]
    fieldnames: List[str]
MetricWriter

Bases: Generic[MetricType], AbstractContextManager

Source code in fgpyo/util/metric.py
class MetricWriter(Generic[MetricType], AbstractContextManager):
    _metric_class: Type[Metric]
    _fieldnames: List[str]
    _fout: TextIOWrapper
    _writer: DictWriter

    def __init__(
        self,
        filename: Union[Path, str],
        metric_class: Type[Metric],
        append: bool = False,
        delimiter: str = "\t",
        include_fields: Optional[List[str]] = None,
        exclude_fields: Optional[List[str]] = None,
        lineterminator: str = "\n",
        threads: Optional[int] = None,
    ) -> None:
        """
        Args:
            filename: Path to the file to write.
            metric_class: Metric class.
            append: If `True`, the file will be appended to. Otherwise, the specified file will be
                overwritten.
            delimiter: The output file delimiter.
            include_fields: If specified, only the listed fieldnames will be included when writing
                records to file. Fields will be written in the order provided.
                May not be used together with `exclude_fields`.
            exclude_fields: If specified, any listed fieldnames will be excluded when writing
                records to file.
                May not be used together with `include_fields`.
            lineterminator: The string used to terminate lines produced by the MetricWriter.
                Default = "\n".
            threads: the number of threads to use when compressing gzip files

        Raises:
            TypeError: If the provided metric class is not a dataclass- or attr-decorated
                subclass of `Metric`.
            AssertionError: If the provided filepath is not writable.
            AssertionError: If `append=True` and the provided file is not readable. (When appending,
                we check to ensure that the header matches the specified metric class. The file must
                be readable to get the header.)
            ValueError: If `append=True` and the provided file is a FIFO (named pipe).
            ValueError: If `append=True` and the provided file does not include a header.
            ValueError: If `append=True` and the header of the provided file does not match the
                specified metric class and the specified include/exclude fields.
        """

        filepath: Path = Path(filename)
        if (filepath.is_fifo() or filepath.is_char_device()) and append:
            raise ValueError("Cannot append to stdout, stderr, or other named pipe or stream")

        ordered_fieldnames: List[str] = _validate_and_generate_final_output_fieldnames(
            metric_class=metric_class,
            include_fields=include_fields,
            exclude_fields=exclude_fields,
        )

        _assert_is_metric_class(metric_class)
        io.assert_path_is_writable(filepath)
        if append:
            io.assert_path_is_readable(filepath)
            _assert_file_header_matches_metric(
                path=filepath,
                metric_class=metric_class,
                ordered_fieldnames=ordered_fieldnames,
                delimiter=delimiter,
            )

        self._metric_class = metric_class
        self._fieldnames = ordered_fieldnames
        self._fout = io.to_writer(filepath, append=append, threads=threads)
        self._writer = DictWriter(
            f=self._fout,
            fieldnames=self._fieldnames,
            delimiter=delimiter,
            lineterminator=lineterminator,
        )

        # If we aren't appending to an existing file, write the header before any rows
        if not append:
            self._writer.writeheader()

    def __enter__(self) -> "MetricWriter":
        return self

    def __exit__(
        self,
        exc_type: Type[BaseException],
        exc_value: BaseException,
        traceback: TracebackType,
    ) -> None:
        self.close()
        super().__exit__(exc_type, exc_value, traceback)

    def close(self) -> None:
        """Close the underlying file handle."""
        self._fout.close()

    def write(self, metric: MetricType) -> None:
        """
        Write a single Metric instance to file.

        The Metric is converted to a dictionary and then written using the underlying
        `csv.DictWriter`. If the `MetricWriter` was created using the `include_fields` or
        `exclude_fields` arguments, the fields of the Metric are subset and/or reordered
        accordingly before writing.

        Args:
            metric: An instance of the specified Metric.

        Raises:
            TypeError: If the provided `metric` is not an instance of the Metric class used to
                parametrize the writer.
        """

        # Serialize the Metric to a dict for writing by the underlying `DictWriter`
        row = {fieldname: val for fieldname, val in metric.formatted_items()}

        # Filter and/or re-order output fields if necessary
        row = {fieldname: row[fieldname] for fieldname in self._fieldnames}

        self._writer.writerow(row)

    def writeall(self, metrics: Iterable[MetricType]) -> None:
        """
        Write multiple Metric instances to file.

        Each Metric is converted to a dictionary and then written using the underlying
        `csv.DictWriter`. If the `MetricWriter` was created using the `include_fields` or
        `exclude_fields` arguments, the attributes of each Metric are subset and/or reordered
        accordingly before writing.

        Args:
            metrics: A sequence of instances of the specified Metric.
        """
        for metric in metrics:
            self.write(metric)
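
A hedged usage sketch, reusing the `Person` metric from the examples above and writing only the `name` column via include_fields:

from fgpyo.util.metric import MetricWriter

with MetricWriter(filename="people.txt", metric_class=Person, include_fields=["name"]) as writer:
    writer.write(Person(name="Alice", age=47))
    writer.writeall([Person(name="Bob", age=24)])
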
Functions
__init__
__init__(filename: Union[Path, str], metric_class: Type[Metric], append: bool = False, delimiter: str = '\t', include_fields: Optional[List[str]] = None, exclude_fields: Optional[List[str]] = None, lineterminator: str = '\n', threads: Optional[int] = None) -> None
    Args:
        filename: Path to the file to write.
        metric_class: Metric class.
        append: If `True`, the file will be appended to. Otherwise, the specified file will be
            overwritten.
        delimiter: The output file delimiter.
        include_fields: If specified, only the listed fieldnames will be included when writing
            records to file. Fields will be written in the order provided.
            May not be used together with `exclude_fields`.
        exclude_fields: If specified, any listed fieldnames will be excluded when writing
            records to file.
            May not be used together with `include_fields`.
        lineterminator: The string used to terminate lines produced by the MetricWriter.
            Default = "\n".
        threads: the number of threads to use when compressing gzip files

    Raises:
        TypeError: If the provided metric class is not a dataclass- or attr-decorated
            subclass of `Metric`.
        AssertionError: If the provided filepath is not writable.
        AssertionError: If `append=True` and the provided file is not readable. (When appending,
            we check to ensure that the header matches the specified metric class. The file must
            be readable to get the header.)
        ValueError: If `append=True` and the provided file is a FIFO (named pipe).
        ValueError: If `append=True` and the provided file does not include a header.
        ValueError: If `append=True` and the header of the provided file does not match the
            specified metric class and the specified include/exclude fields.
Source code in fgpyo/util/metric.py
def __init__(
    self,
    filename: Union[Path, str],
    metric_class: Type[Metric],
    append: bool = False,
    delimiter: str = "\t",
    include_fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None,
    lineterminator: str = "\n",
    threads: Optional[int] = None,
) -> None:
    """
    Args:
        filename: Path to the file to write.
        metric_class: Metric class.
        append: If `True`, the file will be appended to. Otherwise, the specified file will be
            overwritten.
        delimiter: The output file delimiter.
        include_fields: If specified, only the listed fieldnames will be included when writing
            records to file. Fields will be written in the order provided.
            May not be used together with `exclude_fields`.
        exclude_fields: If specified, any listed fieldnames will be excluded when writing
            records to file.
            May not be used together with `include_fields`.
        lineterminator: The string used to terminate lines produced by the MetricWriter.
            Default = "\n".
        threads: the number of threads to use when compressing gzip files

    Raises:
        TypeError: If the provided metric class is not a dataclass- or attr-decorated
            subclass of `Metric`.
        AssertionError: If the provided filepath is not writable.
        AssertionError: If `append=True` and the provided file is not readable. (When appending,
            we check to ensure that the header matches the specified metric class. The file must
            be readable to get the header.)
        ValueError: If `append=True` and the provided file is a FIFO (named pipe).
        ValueError: If `append=True` and the provided file does not include a header.
        ValueError: If `append=True` and the header of the provided file does not match the
            specified metric class and the specified include/exclude fields.
    """

    filepath: Path = Path(filename)
    if (filepath.is_fifo() or filepath.is_char_device()) and append:
        raise ValueError("Cannot append to stdout, stderr, or other named pipe or stream")

    ordered_fieldnames: List[str] = _validate_and_generate_final_output_fieldnames(
        metric_class=metric_class,
        include_fields=include_fields,
        exclude_fields=exclude_fields,
    )

    _assert_is_metric_class(metric_class)
    io.assert_path_is_writable(filepath)
    if append:
        io.assert_path_is_readable(filepath)
        _assert_file_header_matches_metric(
            path=filepath,
            metric_class=metric_class,
            ordered_fieldnames=ordered_fieldnames,
            delimiter=delimiter,
        )

    self._metric_class = metric_class
    self._fieldnames = ordered_fieldnames
    self._fout = io.to_writer(filepath, append=append, threads=threads)
    self._writer = DictWriter(
        f=self._fout,
        fieldnames=self._fieldnames,
        delimiter=delimiter,
        lineterminator=lineterminator,
    )

    # If we aren't appending to an existing file, write the header before any rows
    if not append:
        self._writer.writeheader()
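
For orientation, a minimal usage sketch. SimpleMetric is a hypothetical dataclass-decorated Metric subclass, not part of fgpyo; the writer itself is used exactly as documented above:

from dataclasses import dataclass

from fgpyo.util.metric import Metric, MetricWriter


@dataclass
class SimpleMetric(Metric["SimpleMetric"]):
    name: str
    count: int


# The header is written at construction time (append=False), and the writer
# is a context manager, so the underlying handle is closed on exit.
with MetricWriter("metrics.txt", metric_class=SimpleMetric) as writer:
    writer.write(SimpleMetric(name="a", count=1))
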
close
close() -> None

Close the underlying file handle.

Source code in fgpyo/util/metric.py
def close(self) -> None:
    """Close the underlying file handle."""
    self._fout.close()
write
write(metric: MetricType) -> None

Write a single Metric instance to file.

The Metric is converted to a dictionary and then written using the underlying csv.DictWriter. If the MetricWriter was created using the include_fields or exclude_fields arguments, the fields of the Metric are subset and/or reordered accordingly before writing.

Parameters:

Name Type Description Default
metric MetricType

An instance of the specified Metric.

required

Raises:

Type Description
TypeError

If the provided metric is not an instance of the Metric class used to parametrize the writer.

Source code in fgpyo/util/metric.py
def write(self, metric: MetricType) -> None:
    """
    Write a single Metric instance to file.

    The Metric is converted to a dictionary and then written using the underlying
    `csv.DictWriter`. If the `MetricWriter` was created using the `include_fields` or
    `exclude_fields` arguments, the fields of the Metric are subset and/or reordered
    accordingly before writing.

    Args:
        metric: An instance of the specified Metric.

    Raises:
        TypeError: If the provided `metric` is not an instance of the Metric class used to
            parametrize the writer.
    """

    # Serialize the Metric to a dict for writing by the underlying `DictWriter`
    row = {fieldname: val for fieldname, val in metric.formatted_items()}

    # Filter and/or re-order output fields if necessary
    row = {fieldname: row[fieldname] for fieldname in self._fieldnames}

    self._writer.writerow(row)
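
To make the subsetting concrete: with include_fields, each row produced by write contains only the requested columns, in the requested order. A sketch using the same hypothetical SimpleMetric as above:

with MetricWriter(
    "counts_only.txt",
    metric_class=SimpleMetric,
    include_fields=["count"],
) as writer:
    # Only the `count` column is written; `name` is dropped from the row.
    writer.write(SimpleMetric(name="a", count=1))
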
writeall
writeall(metrics: Iterable[MetricType]) -> None

Write multiple Metric instances to file.

Each Metric is converted to a dictionary and then written using the underlying csv.DictWriter. If the MetricWriter was created using the include_fields or exclude_fields arguments, the attributes of each Metric are subset and/or reordered accordingly before writing.

Parameters:

Name Type Description Default
metrics Iterable[MetricType]

A sequence of instances of the specified Metric.

required
Source code in fgpyo/util/metric.py
def writeall(self, metrics: Iterable[MetricType]) -> None:
    """
    Write multiple Metric instances to file.

    Each Metric is converted to a dictionary and then written using the underlying
    `csv.DictWriter`. If the `MetricWriter` was created using the `include_fields` or
    `exclude_fields` arguments, the attributes of each Metric are subset and/or reordered
    accordingly before writing.

    Args:
        metrics: A sequence of instances of the specified Metric.
    """
    for metric in metrics:
        self.write(metric)
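
And a sketch of the append workflow: the existing header is validated against the metric class before any new rows are written, so a mismatch fails fast rather than producing a corrupt file. Again, SimpleMetric is the hypothetical class from above:

metrics = [SimpleMetric(name="b", count=2), SimpleMetric(name="c", count=3)]

# Appending re-reads the header of metrics.txt and raises a ValueError if it
# does not match SimpleMetric's fieldnames; no new header is written.
with MetricWriter("metrics.txt", metric_class=SimpleMetric, append=True) as writer:
    writer.writeall(metrics)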

Modules

string

Functions

column_it
column_it(rows: List[List[str]], delimiter: str = ' ') -> str

A simple version of Unix's column utility. This assumes the table is NxM.

Parameters:

Name Type Description Default
rows List[List[str]]

the rows to adjust. Each row must have the same number of delimited fields.

required
delimiter str

the delimiter for each field in a row.

' '
Source code in fgpyo/util/string.py
def column_it(rows: List[List[str]], delimiter: str = " ") -> str:
    """A simple version of Unix's `column` utility.  This assumes the table is NxM.

    Args:
        rows: the rows to adjust.  Each row must have the same number of delimited fields.
        delimiter: the delimiter for each field in a row.
    """
    # get the # of columns
    num_columns = len(rows[0])
    # for each column, find the maximum length of a cell
    max_column_lengths: List[int] = [
        max(len(row[col_i]) for row in rows) for col_i in range(num_columns)
    ]
    # pad each row in the table
    return "\n".join(
        delimiter.join(
            (" " * (max_column_lengths[col_i] - len(row[col_i]))) + row[col_i]
            for col_i in range(num_columns)
        )
        for row in rows
    )
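
For example, right-justifying a small 3x2 table (expected output shown in comments):

rows = [
    ["name", "count"],
    ["alpha", "1"],
    ["b", "100"],
]
print(column_it(rows))
#  name count
# alpha     1
#     b   100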

types

Attributes

TypeAnnotation module-attribute
TypeAnnotation: TypeAlias = Union[type, _GenericAlias, UnionType, GenericAlias]

A function parameter's type annotation may be any of the following:

1) type, when declaring any of the built-in Python types
2) typing._GenericAlias, when declaring generic collection types or union types using pre-PEP 585 and pre-PEP 604 syntax (e.g. List[int], Optional[int], or Union[int, None])
3) types.UnionType, when declaring union types using PEP 604 syntax (e.g. int | None)
4) types.GenericAlias, when declaring generic collection types using PEP 585 syntax (e.g. list[int])

types.GenericAlias is a subclass of type, but typing._GenericAlias and types.UnionType are not and must be considered explicitly.
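
A short, hedged sketch of why all four members are needed. Note that typing._GenericAlias is a private name and the exact subclasses vary across Python versions; on Python 3.10+ each annotation syntax produces a distinct runtime type:

import types
import typing
from typing import List, Optional

assert isinstance(int, type)                            # 1) a plain built-in type
assert isinstance(List[int], typing._GenericAlias)      # 2) pre-PEP 585 generic
assert isinstance(Optional[int], typing._GenericAlias)  # 2) pre-PEP 604 union
assert isinstance(int | None, types.UnionType)          # 3) PEP 604 union (3.10+)
assert isinstance(list[int], types.GenericAlias)        # 4) PEP 585 generic (3.9+)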

Functions

is_constructible_from_str
is_constructible_from_str(type_: type) -> bool

Returns true if the provided type can be constructed from a string

Source code in fgpyo/util/types.py
def is_constructible_from_str(type_: type) -> bool:
    """Returns true if the provided type can be constructed from a string"""
    try:
        sig = inspect.signature(type_)
        ((argname, _),) = sig.bind(object()).arguments.items()
    except TypeError:  # Can be raised by signature() or Signature.bind().
        return False
    except ValueError:
        # Can be raised for classes, if the relevant info is in `__init__`.
        if not isinstance(type_, type):
            raise
    else:
        if sig.parameters[argname].annotation is str:
            return True
    # FIXME
    # if isinstance(type_, type):
    #     # signature() first checks __new__, if it is present.
    #     return _is_constructible_from_str(type_.__init__(object(), type_))
    return False
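
To make the rule concrete: a type qualifies when its signature binds exactly one argument and that argument is annotated str. A sketch with two hypothetical classes:

class FromStr:
    def __init__(self, value: str) -> None:
        self.value = value


class FromInt:
    def __init__(self, value: int) -> None:
        self.value = value


assert is_constructible_from_str(FromStr)
assert not is_constructible_from_str(FromInt)  # single argument, but annotated int
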
is_list_like
is_list_like(type_: type) -> bool

Returns true if the provided type is a list or a list-like generic (Iterable or Sequence)

Source code in fgpyo/util/types.py
def is_list_like(type_: type) -> bool:
    """Returns true if the value is a list or list like object"""
    return typing.get_origin(type_) in [list, collections.abc.Iterable, collections.abc.Sequence]
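
Because the check inspects typing.get_origin, parameterized aliases of list, Iterable, and Sequence qualify, while bare list (whose origin is None) and mappings do not:

from typing import Dict, Iterable, List, Sequence

assert is_list_like(List[int])
assert is_list_like(Sequence[str])
assert is_list_like(Iterable[float])
assert not is_list_like(list)            # bare list: get_origin returns None
assert not is_list_like(Dict[str, int])  # origin is dict, not list-like
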
make_enum_parser
make_enum_parser(enum: Type[EnumType]) -> partial

Makes a parser function for enum classes

Source code in fgpyo/util/types.py
def make_enum_parser(enum: Type[EnumType]) -> partial:
    """Makes a parser function for enum classes"""
    return partial(_make_enum_parser_worker, enum)
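
The private worker is not shown here; assuming it constructs the enum member from the string (whether by name or by value), an enum whose member names equal their values behaves identically under either interpretation:

from enum import Enum


class Color(Enum):
    RED = "RED"
    BLUE = "BLUE"


parser = make_enum_parser(Color)
assert parser("RED") is Color.RED
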
make_literal_parser
make_literal_parser(literal: Type[LiteralType], parsers: Iterable[Callable[[str], LiteralType]]) -> partial

Generates a parser function for a Literal type, given parsers for the possible values of that Literal type

Source code in fgpyo/util/types.py
def make_literal_parser(
    literal: Type[LiteralType], parsers: Iterable[Callable[[str], LiteralType]]
) -> partial:
    """Generates a parser function for a literal type object and a set of parsers for the possible
    parsers to that literal type object
    """
    return partial(_make_literal_parser_worker, literal, parsers)
make_union_parser
make_union_parser(union: Type[UnionType], parsers: Iterable[Callable[[str], UnionType]]) -> partial

Generates a parser function for a Union type, given parsers for the member types of that Union

Source code in fgpyo/util/types.py
def make_union_parser(
    union: Type[UnionType], parsers: Iterable[Callable[[str], UnionType]]
) -> partial:
    """Generates a parser function for a union type object and set of parsers for the possible
    parsers to that union type object
    """
    return partial(_make_union_parser_worker, union, parsers)
none_parser
none_parser(value: str) -> Literal[None]

Returns None if the value is the empty string, else raises an error

Source code in fgpyo/util/types.py
def none_parser(value: str) -> Literal[None]:
    """Returns None if the value is 'None', else raises an error"""
    if value == "":
        return None
    raise ValueError(f"NoneType not a valid type for {value}")
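
Per the source above, only the empty string parses to None; any other input, including the literal string "None", raises:

assert none_parser("") is None

try:
    none_parser("None")  # the literal string "None" is rejected
except ValueError:
    pass
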
parse_bool
parse_bool(string: str) -> bool

Parses strings into bools, accepting common text representations (case-insensitive): "t", "true", and "1" for True; "f", "false", and "0" for False

Source code in fgpyo/util/types.py
def parse_bool(string: str) -> bool:
    """Parses strings into bools accounting for the many different text representations of bools
    that can be used
    """
    if string.lower() in ["t", "true", "1"]:
        return True
    elif string.lower() in ["f", "false", "0"]:
        return False
    else:
        raise ValueError("{} is not a valid boolean string".format(string))
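
Matching is case-insensitive, with exactly three accepted spellings per value:

assert parse_bool("T")
assert parse_bool("true")
assert not parse_bool("0")

try:
    parse_bool("yes")  # not an accepted spelling
except ValueError:
    pass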