Skip to content

sequence_dictionary

Classes for representing sequencing dictionaries.

Examples of building and using sequence dictionaries

Building a sequence dictionary from a pysam.AlignmentHeader:

>>> import pysam
>>> from fgpyo.fasta.sequence_dictionary import SequenceDictionary
>>> sd: SequenceDictionary
>>> with pysam.AlignmentFile("./tests/fgpyo/sam/data/valid.sam") as fh:
...     sd = SequenceDictionary.from_sam(fh.header)
>>> print(sd)  
@SQ     SN:chr1 LN:101
@SQ     SN:chr2 LN:101
@SQ     SN:chr3 LN:101
@SQ     SN:chr4 LN:101
@SQ     SN:chr5 LN:101
@SQ     SN:chr6 LN:101
@SQ     SN:chr7 LN:404
@SQ     SN:chr8 LN:202

Query based on index:

>>> print(sd[3])  
@SQ     SN:chr4 LN:101

Query based on name:

>>> print(sd["chr6"])  
@SQ     SN:chr6 LN:101

Add, get, and delete attributes:

>>> from fgpyo.fasta.sequence_dictionary import Keys
>>> meta = sd[0]
>>> print(meta)  
@SQ     SN:chr1 LN:101
>>> meta[Keys.ASSEMBLY] = "hg38"
>>> print(meta)  
@SQ     SN:chr1 LN:101  AS:hg38
>>> meta.get(Keys.ASSEMBLY)
'hg38'
>>> meta.get(Keys.SPECIES) is None
True
>>> Keys.MD5 in meta
False
>>> del meta[Keys.ASSEMBLY]
>>> print(meta)  
@SQ     SN:chr1 LN:101

Get a sequence based on one of its aliases

>>> meta[Keys.ALIASES] = "foo,bar,car"
>>> sd = SequenceDictionary(infos=[meta] + sd.infos[1:])
>>> print(sd)  
@SQ     SN:chr1 LN:101  AN:foo,bar,car
@SQ     SN:chr2 LN:101
@SQ     SN:chr3 LN:101
@SQ     SN:chr4 LN:101
@SQ     SN:chr5 LN:101
@SQ     SN:chr6 LN:101
@SQ     SN:chr7 LN:404
@SQ     SN:chr8 LN:202
>>> print(sd["chr1"])  
@SQ     SN:chr1 LN:101  AN:foo,bar,car
>>> print(sd["bar"])  
@SQ     SN:chr1 LN:101  AN:foo,bar,car

Create a pysam.AlignmentHeader from a sequence dictionary:

>>> sd.to_sam_header()  
<pysam.libcalignmentfile.AlignmentHeader object at ...>
>>> print(sd.to_sam_header())  
@HD     VN:1.5
@SQ     SN:chr1 LN:101  AN:foo,bar,car
@SQ     SN:chr2 LN:101
@SQ     SN:chr3 LN:101
@SQ     SN:chr4 LN:101
@SQ     SN:chr5 LN:101
@SQ     SN:chr6 LN:101
@SQ     SN:chr7 LN:404
@SQ     SN:chr8 LN:202

Create a pysam.AlignmentHeader from a sequence dictionary with extra header items:

>>> sd.to_sam_header(
...     extra_header={"RG": [{"ID": "A", "LB": "a-library"}, {"ID": "B", "LB": "b-library"}]}
... )  
<pysam.libcalignmentfile.AlignmentHeader object at ...>
>>> print(sd.to_sam_header(
...     extra_header={"RG": [{"ID": "A", "LB": "a-library"}, {"ID": "B", "LB": "b-library"}]}
... ))  
@HD     VN:1.5
@SQ     SN:chr1 LN:101  AN:foo,bar,car
@SQ     SN:chr2 LN:101
@SQ     SN:chr3 LN:101
@SQ     SN:chr4 LN:101
@SQ     SN:chr5 LN:101
@SQ     SN:chr6 LN:101
@SQ     SN:chr7 LN:404
@SQ     SN:chr8 LN:202
@RG     ID:A    LB:a-library
@RG     ID:B    LB:b-library

Attributes

SEQUENCE_NAME_PATTERN module-attribute

SEQUENCE_NAME_PATTERN: Pattern = compile('^[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*$')

Regular expression for valid reference sequence names according to the SAM spec

Classes

AlternateLocus dataclass

Stores an alternate locus for an associated sequence (1-based inclusive)

Source code in fgpyo/fasta/sequence_dictionary.py
@dataclass(frozen=True, init=True)
class AlternateLocus:
    """Stores an alternate locus for an associated sequence (1-based inclusive)"""

    name: str
    start: int
    end: int

    def __post_init__(self) -> None:
        """Any post initialization validation should go here"""
        if self.start > self.end:
            raise ValueError(f"start > end: {self.start} > {self.end}")
        if self.start < 1:
            raise ValueError(f"start < 1: {self.start}")

    def __str__(self) -> str:
        return f"{self.name}:{self.start}-{self.end}"

    def __len__(self) -> int:
        return self.end - self.start + 1

    @staticmethod
    def parse(value: str) -> "AlternateLocus":
        """Parse the genomic interval of format: `<contig>:<start>-<end>`"""
        name, rest = value.split(":", maxsplit=1)
        start, end = rest.split("-", maxsplit=1)
        return AlternateLocus(name=name, start=int(start), end=int(end))

Functions

__post_init__
__post_init__() -> None

Any post initialization validation should go here

Source code in fgpyo/fasta/sequence_dictionary.py
def __post_init__(self) -> None:
    """Any post initialization validation should go here"""
    if self.start > self.end:
        raise ValueError(f"start > end: {self.start} > {self.end}")
    if self.start < 1:
        raise ValueError(f"start < 1: {self.start}")
parse staticmethod
parse(value: str) -> AlternateLocus

Parse the genomic interval of format: <contig>:<start>-<end>

Source code in fgpyo/fasta/sequence_dictionary.py
@staticmethod
def parse(value: str) -> "AlternateLocus":
    """Parse the genomic interval of format: `<contig>:<start>-<end>`"""
    name, rest = value.split(":", maxsplit=1)
    start, end = rest.split("-", maxsplit=1)
    return AlternateLocus(name=name, start=int(start), end=int(end))

Keys

Bases: StrEnum

Enumeration of tags/attributes available on a sequence record/metadata (SAM @SQ line).

Source code in fgpyo/fasta/sequence_dictionary.py
@unique
class Keys(StrEnum):
    """Enumeration of tags/attributes available on a sequence record/metadata (SAM @SQ line)."""

    ALIASES = "AN"
    ALTERNATE_LOCUS = "AH"
    ASSEMBLY = "AS"
    DESCRIPTION = "DS"
    SEQUENCE_LENGTH = "LN"
    MD5 = "M5"
    SEQUENCE_NAME = "SN"
    SPECIES = "SP"
    TOPOLOGY = "TP"
    URI = "UR"

    @staticmethod
    def attributes() -> List[str]:
        """The list of keys that are allowed to be attributes in `SequenceMetadata`.  Notably
        `SEQUENCE_LENGTH` and `SEQUENCE_NAME` are not allowed."""
        return [key for key in Keys if key != Keys.SEQUENCE_NAME and key != Keys.SEQUENCE_LENGTH]

Functions

attributes staticmethod
attributes() -> List[str]

The list of keys that are allowed to be attributes in SequenceMetadata. Notably SEQUENCE_LENGTH and SEQUENCE_NAME are not allowed.

Source code in fgpyo/fasta/sequence_dictionary.py
@staticmethod
def attributes() -> List[str]:
    """The list of keys that are allowed to be attributes in `SequenceMetadata`.  Notably
    `SEQUENCE_LENGTH` and `SEQUENCE_NAME` are not allowed."""
    return [key for key in Keys if key != Keys.SEQUENCE_NAME and key != Keys.SEQUENCE_LENGTH]

SequenceDictionary dataclass

Bases: Mapping[Union[str, int], SequenceMetadata]

Contains an ordered collection of sequences.

A specific SequenceMetadata may be retrieved by name (str) or index (int), either by using the generic get method or by the correspondingly named by_name and by_index methods. The latter methods provide faster retrieval when the type is known.

This mapping collection iterates over the keys. To iterate over each SequenceMetadata, either use the typical values() method or access the metadata directly with infos.

Attributes:

Name Type Description
infos List[SequenceMetadata]

the ordered collection of sequence metadata

Source code in fgpyo/fasta/sequence_dictionary.py
@dataclass(frozen=True, init=True)
class SequenceDictionary(Mapping[Union[str, int], SequenceMetadata]):
    """Contains an ordered collection of sequences.

    A specific `SequenceMetadata` may be retrieved by name (`str`) or index (`int`), either by
    using the generic `get` method or by the correspondingly named `by_name` and `by_index` methods.
    The latter methods provide faster retrieval when the type is known.

    This _mapping_ collection iterates over the _keys_.  To iterate over each `SequenceMetadata`,
    either use the typical `values()` method or access the metadata directly with `infos`.

    Attributes:
        infos: the ordered collection of sequence metadata
    """

    infos: List[SequenceMetadata]
    _dict: Dict[str, SequenceMetadata] = field(init=False, repr=False)

    def __post_init__(self) -> None:
        # Initialize a mapping from sequence name to the sequence metadata for all names
        self_dict: Dict[str, SequenceMetadata] = {}
        for index, info in enumerate(self.infos):
            if info.index != index:
                raise ValueError(
                    "Infos must be given with index set correctly."
                    + f"  See ${index}th with name: {info.name}"
                )
            for name in info.all_names:
                if name in self_dict:
                    raise ValueError(f"Found duplicate sequence name: {name}")
                self_dict[name] = info
        object.__setattr__(self, "_dict", self_dict)

    def same_as(self, other: "SequenceDictionary") -> bool:
        """Returns true if the sequences share a common reference name (including aliases), have
        the same length, and the same MD5 if both have MD5s"""
        if len(self) != len(other):
            return False
        return all(this.same_as(that) for this, that in zip(self.infos, other.infos))

    def to_sam(self) -> List[Dict[str, Any]]:
        """Converts the list of dictionaries, one per sequence."""
        return [meta.to_sam() for meta in self.infos]

    def to_sam_header(
        self,
        extra_header: Optional[Dict[str, Any]] = None,
    ) -> pysam.AlignmentHeader:
        """Converts the sequence dictionary to a `pysam.AlignmentHeader`.

        Args:
            extra_header: a dictionary of extra values to add to the header, None otherwise.  See
                          `:~pysam.AlignmentHeader` for more details.
        """
        header_dict: Dict[str, Any] = {
            "HD": {"VN": "1.5"},
            "SQ": self.to_sam(),
        }
        if extra_header is not None:
            header_dict = {**header_dict, **extra_header}
        return pysam.AlignmentHeader.from_dict(header_dict=header_dict)

    @staticmethod
    @overload
    def from_sam(data: Path) -> "SequenceDictionary": ...

    @staticmethod
    @overload
    def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ...

    @staticmethod
    @overload
    def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ...

    @staticmethod
    @overload
    def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ...

    @staticmethod
    def from_sam(
        data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]],
    ) -> "SequenceDictionary":
        """Creates a `SequenceDictionary` from a SAM file or its header.

        Args:
            data: The input may be any of:
                - a path to a SAM file
                - an open `pysam.AlignmentFile`
                - the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile`
                - the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()`
        Returns:
            A `SequenceDictionary` mapping refrence names to their metadata.
        """
        seq_dict: SequenceDictionary
        if isinstance(data, pysam.AlignmentHeader):
            seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"])
        elif isinstance(data, pysam.AlignmentFile):
            seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"])
        elif isinstance(data, Path):
            with sam.reader(data) as fh:
                seq_dict = SequenceDictionary.from_sam(fh.header)
        else:  # assuming `data` is a `list[dict[str, Any]]`
            try:
                infos: List[SequenceMetadata] = [
                    SequenceMetadata.from_sam(meta=meta, index=index)
                    for index, meta in enumerate(data)
                ]
                seq_dict = SequenceDictionary(infos=infos)
            except Exception as e:
                raise ValueError(f"Could not parse sequence information from data: {data}") from e

        return seq_dict

    def __getitem__(self, key: Union[str, int]) -> SequenceMetadata:
        return self._dict[key] if isinstance(key, str) else self.infos[key]

    def get_by_name(self, name: str) -> Optional[SequenceMetadata]:
        """Gets a `SequenceMetadata` explicitly by `name`.  Returns None if
        the name does not exist in this dictionary"""
        return self._dict.get(name)

    def by_name(self, name: str) -> SequenceMetadata:
        """Gets a `SequenceMetadata` explicitly by `name`.  The name must exist."""
        return self._dict[name]

    def by_index(self, index: int) -> SequenceMetadata:
        """Gets a `SequenceMetadata` explicitly by `name`.  Raises an `IndexError`
        if the index is out of bounds."""
        return self.infos[index]

    def __iter__(self) -> Iterator[str]:
        return iter(self._dict)

    def __len__(self) -> int:
        return len(self.infos)

    def __str__(self) -> str:
        return "\n".join(f"{info}" for info in self.infos)

Functions

by_index
by_index(index: int) -> SequenceMetadata

Gets a SequenceMetadata explicitly by name. Raises an IndexError if the index is out of bounds.

Source code in fgpyo/fasta/sequence_dictionary.py
def by_index(self, index: int) -> SequenceMetadata:
    """Gets a `SequenceMetadata` explicitly by `name`.  Raises an `IndexError`
    if the index is out of bounds."""
    return self.infos[index]
by_name
by_name(name: str) -> SequenceMetadata

Gets a SequenceMetadata explicitly by name. The name must exist.

Source code in fgpyo/fasta/sequence_dictionary.py
def by_name(self, name: str) -> SequenceMetadata:
    """Gets a `SequenceMetadata` explicitly by `name`.  The name must exist."""
    return self._dict[name]
from_sam staticmethod
from_sam(data: Path) -> SequenceDictionary
from_sam(data: AlignmentFile) -> SequenceDictionary
from_sam(data: AlignmentHeader) -> SequenceDictionary
from_sam(data: List[Dict[str, Any]]) -> SequenceDictionary
from_sam(data: Union[Path, AlignmentFile, AlignmentHeader, List[Dict[str, Any]]]) -> SequenceDictionary

Creates a SequenceDictionary from a SAM file or its header.

Parameters:

Name Type Description Default
data Union[Path, AlignmentFile, AlignmentHeader, List[Dict[str, Any]]]

The input may be any of: - a path to a SAM file - an open pysam.AlignmentFile - the pysam.AlignmentHeader associated with a pysam.AlignmentFile - the contents of a header's SQ fields, as returned by AlignmentHeader.to_dict()

required

Returns: A SequenceDictionary mapping refrence names to their metadata.

Source code in fgpyo/fasta/sequence_dictionary.py
@staticmethod
def from_sam(
    data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]],
) -> "SequenceDictionary":
    """Creates a `SequenceDictionary` from a SAM file or its header.

    Args:
        data: The input may be any of:
            - a path to a SAM file
            - an open `pysam.AlignmentFile`
            - the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile`
            - the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()`
    Returns:
        A `SequenceDictionary` mapping refrence names to their metadata.
    """
    seq_dict: SequenceDictionary
    if isinstance(data, pysam.AlignmentHeader):
        seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"])
    elif isinstance(data, pysam.AlignmentFile):
        seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"])
    elif isinstance(data, Path):
        with sam.reader(data) as fh:
            seq_dict = SequenceDictionary.from_sam(fh.header)
    else:  # assuming `data` is a `list[dict[str, Any]]`
        try:
            infos: List[SequenceMetadata] = [
                SequenceMetadata.from_sam(meta=meta, index=index)
                for index, meta in enumerate(data)
            ]
            seq_dict = SequenceDictionary(infos=infos)
        except Exception as e:
            raise ValueError(f"Could not parse sequence information from data: {data}") from e

    return seq_dict
get_by_name
get_by_name(name: str) -> Optional[SequenceMetadata]

Gets a SequenceMetadata explicitly by name. Returns None if the name does not exist in this dictionary

Source code in fgpyo/fasta/sequence_dictionary.py
def get_by_name(self, name: str) -> Optional[SequenceMetadata]:
    """Gets a `SequenceMetadata` explicitly by `name`.  Returns None if
    the name does not exist in this dictionary"""
    return self._dict.get(name)
same_as
same_as(other: SequenceDictionary) -> bool

Returns true if the sequences share a common reference name (including aliases), have the same length, and the same MD5 if both have MD5s

Source code in fgpyo/fasta/sequence_dictionary.py
def same_as(self, other: "SequenceDictionary") -> bool:
    """Returns true if the sequences share a common reference name (including aliases), have
    the same length, and the same MD5 if both have MD5s"""
    if len(self) != len(other):
        return False
    return all(this.same_as(that) for this, that in zip(self.infos, other.infos))
to_sam
to_sam() -> List[Dict[str, Any]]

Converts the list of dictionaries, one per sequence.

Source code in fgpyo/fasta/sequence_dictionary.py
def to_sam(self) -> List[Dict[str, Any]]:
    """Converts the list of dictionaries, one per sequence."""
    return [meta.to_sam() for meta in self.infos]
to_sam_header
to_sam_header(extra_header: Optional[Dict[str, Any]] = None) -> AlignmentHeader

Converts the sequence dictionary to a pysam.AlignmentHeader.

Parameters:

Name Type Description Default
extra_header Optional[Dict[str, Any]]

a dictionary of extra values to add to the header, None otherwise. See :~pysam.AlignmentHeader for more details.

None
Source code in fgpyo/fasta/sequence_dictionary.py
def to_sam_header(
    self,
    extra_header: Optional[Dict[str, Any]] = None,
) -> pysam.AlignmentHeader:
    """Converts the sequence dictionary to a `pysam.AlignmentHeader`.

    Args:
        extra_header: a dictionary of extra values to add to the header, None otherwise.  See
                      `:~pysam.AlignmentHeader` for more details.
    """
    header_dict: Dict[str, Any] = {
        "HD": {"VN": "1.5"},
        "SQ": self.to_sam(),
    }
    if extra_header is not None:
        header_dict = {**header_dict, **extra_header}
    return pysam.AlignmentHeader.from_dict(header_dict=header_dict)

SequenceMetadata dataclass

Bases: MutableMapping[Union[Keys, str], str]

Stores information about a single Sequence (ex. chromosome, contig).

Implements the mutable mapping interface, which provides access to the attributes of this sequence, including name, length, but not index. When using the mapping interface, for example getting, setting, deleting, as well as iterating over keys, values, and items, the values will always be strings (str type). For example, the length will be an str when accessing via get; access the length directly or use len to return an int. Similarly, use the alias property to return a List[str] of aliases, use the alternate property to return an AlternativeLocus-typed instance, and topology property to return a Toplogy-typed instance.

All attributes except name and length may be set. Use dataclasses.replace to create a new copy in such cases.

Important: The len method returns the length of the sequence, not the length of the attributes. Use len(meta.attributes) for the latter.

Attributes:

Name Type Description
name str

the primary name of the sequence

length int

the length of the sequence, or zero if unknown

index int

the index in the sequence dictionary

attributes Dict[Union[Keys, str], str]

attributes of this sequence

Source code in fgpyo/fasta/sequence_dictionary.py
@dataclass(frozen=True, init=True)
class SequenceMetadata(MutableMapping[Union[Keys, str], str]):
    """Stores information about a single Sequence (ex. chromosome, contig).

    Implements the mutable mapping interface, which provides access to the attributes of this
    sequence, including name, length, but not index.  When using the mapping interface, for example
    getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will
    always be strings (`str` type).  For example, the length will be an `str` when accessing via
    `get`; access the length directly or use `len` to return an `int`.  Similarly, use the
    `alias` property to return a `List[str]` of aliases, use the `alternate` property to return
    an `AlternativeLocus`-typed instance, and `topology` property to return a `Toplogy`-typed
    instance.

    All attributes except name and length may be set.  Use `dataclasses.replace` to create a new
    copy in such cases.

    Important: The `len` method returns the length of the sequence, not the length of the
    attributes.  Use `len(meta.attributes)` for the latter.

    Attributes:
      name: the primary name of the sequence
      length: the length of the sequence, or zero if unknown
      index: the index in the sequence dictionary
      attributes: attributes of this sequence
    """

    name: str
    length: int
    index: int
    attributes: Dict[Union[Keys, str], str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Any post initialization validation should go here"""
        if self.length < 0:
            raise ValueError(f"Length must be >= 0 for '{self.name}'")
        if re.search(SEQUENCE_NAME_PATTERN, self.name) is None:
            raise ValueError(f"Illegal name: '{self.name}'")
        if Keys.SEQUENCE_NAME in self.attributes:
            raise ValueError(f"'{Keys.SEQUENCE_NAME}' should not given in the list of attributes")
        if Keys.SEQUENCE_LENGTH in self.attributes:
            raise ValueError(f"'{Keys.SEQUENCE_LENGTH}' should not given in the list of attributes")

    @property
    def aliases(self) -> List[str]:
        """The aliases (not including the primary) name"""
        aliases = self.attributes.get(Keys.ALIASES)
        return [] if aliases is None else aliases.split(",")

    @property
    def all_names(self) -> List[str]:
        """A list of all names, including the primary name and aliases, in that order."""
        return [self.name] + self.aliases

    @property
    def alternate(self) -> Optional[AlternateLocus]:
        """Gets the alternate locus for this sequence"""
        if Keys.ALTERNATE_LOCUS not in self.attributes:
            return None
        value = self.attributes[Keys.ALTERNATE_LOCUS]
        if value == "*":
            return None
        locus = AlternateLocus.parse(value)
        if locus.name == "=":
            locus = replace(locus, name=self.name)
        return locus

    @property
    def is_alternate(self) -> bool:
        """True if there is an alternate locus defined, False otherwise"""
        return self.alternate is not None

    @property
    def md5(self) -> Optional[str]:
        return self.get(Keys.MD5)

    @property
    def assembly(self) -> Optional[str]:
        return self.get(Keys.ASSEMBLY)

    @property
    def uri(self) -> Optional[str]:
        return self.get(Keys.URI)

    @property
    def species(self) -> Optional[str]:
        return self.get(Keys.SPECIES)

    @property
    def description(self) -> Optional[str]:
        return self.get(Keys.DESCRIPTION)

    @property
    def topology(self) -> Optional[Topology]:
        value = self.get(Keys.TOPOLOGY)
        return None if value is None else Topology[value]

    def same_as(self, other: "SequenceMetadata") -> bool:
        """Returns true if the sequences share a common reference name (including aliases), have
        the same length, and the same MD5 if both have MD5s."""
        if self.length != other.length:
            return False
        elif self.name != other.name and other.name not in self.all_names:
            return False
        self_m5 = self.md5
        other_m5 = other.md5
        if self_m5 is None or other_m5 is None:
            return True
        else:
            return self_m5 == other_m5

    def to_sam(self) -> Dict[str, Any]:
        """Converts the sequence metadata to a dictionary equivalent to one item in the
        list of sequences from `pysam.AlignmentHeader#to_dict()["SQ"]`."""
        meta_dict: Dict[str, Any] = {
            f"{Keys.SEQUENCE_NAME}": self.name,
            f"{Keys.SEQUENCE_LENGTH}": self.length,
        }
        if len(self.attributes) > 0:
            meta_dict = {**meta_dict, **self.attributes}

        return meta_dict

    @staticmethod
    def from_sam(meta: Dict[Union[Keys, str], Any], index: int) -> "SequenceMetadata":
        """Builds a `SequenceMetadata` from a dictionary.  The keys must include the sequence
        name (`Keys.SEQUENCE_NAME`) and length (`Keys.SEQUENCE_LENGTH`).  All other keys from
        `Keys` will be stored in the resulting attributes.

        Args:
            meta: the python dictionary with keys from `Keys`.  This is typically the dictionary
                  stored in the `"SQ"` level of the two-level dictionary returned by the
                  `pysam.AlignmentHeader#to_dict()` method.
            index: the 0-based index to use for this sequence
        """
        name = meta[Keys.SEQUENCE_NAME]
        length = meta[Keys.SEQUENCE_LENGTH]
        attributes = copy.deepcopy(meta)
        del attributes[Keys.SEQUENCE_NAME]
        del attributes[Keys.SEQUENCE_LENGTH]
        return SequenceMetadata(name=name, length=length, index=index, attributes=attributes)

    def __getitem__(self, key: Union[Keys, str]) -> Any:
        if key == Keys.SEQUENCE_NAME.value:
            return self.name
        elif key == Keys.SEQUENCE_LENGTH.value:
            return f"{self.length}"
        return self.attributes[key]

    def __setitem__(self, key: Union[Keys, str], value: str) -> None:
        if key == Keys.SEQUENCE_NAME or key == Keys.SEQUENCE_LENGTH:
            raise KeyError(f"Cannot set '{key}' on SequenceMetadata with name '{self.name}'")
        self.attributes[key] = value

    def __delitem__(self, key: Union[Keys, str]) -> None:
        if key == Keys.SEQUENCE_NAME or key == Keys.SEQUENCE_LENGTH:
            raise KeyError(f"Cannot delete '{key}' on SequenceMetadata with name '{self.name}'")
        del self.attributes[key]

    def __iter__(self) -> Iterator[Union[Keys, str]]:
        pre_iter = iter((Keys.SEQUENCE_NAME, Keys.SEQUENCE_LENGTH))
        return itertools.chain(pre_iter, iter(self.attributes))

    def __len__(self) -> int:
        return self.length

    def __str__(self) -> str:
        return "@SQ\t" + "\t".join(f"{key}:{value}" for key, value in self.to_sam().items())

    def __index__(self) -> int:
        return self.index

Attributes

aliases property
aliases: List[str]

The aliases (not including the primary) name

all_names property
all_names: List[str]

A list of all names, including the primary name and aliases, in that order.

alternate property
alternate: Optional[AlternateLocus]

Gets the alternate locus for this sequence

is_alternate property
is_alternate: bool

True if there is an alternate locus defined, False otherwise

Functions

__post_init__
__post_init__() -> None

Any post initialization validation should go here

Source code in fgpyo/fasta/sequence_dictionary.py
def __post_init__(self) -> None:
    """Any post initialization validation should go here"""
    if self.length < 0:
        raise ValueError(f"Length must be >= 0 for '{self.name}'")
    if re.search(SEQUENCE_NAME_PATTERN, self.name) is None:
        raise ValueError(f"Illegal name: '{self.name}'")
    if Keys.SEQUENCE_NAME in self.attributes:
        raise ValueError(f"'{Keys.SEQUENCE_NAME}' should not given in the list of attributes")
    if Keys.SEQUENCE_LENGTH in self.attributes:
        raise ValueError(f"'{Keys.SEQUENCE_LENGTH}' should not given in the list of attributes")
from_sam staticmethod
from_sam(meta: Dict[Union[Keys, str], Any], index: int) -> SequenceMetadata

Builds a SequenceMetadata from a dictionary. The keys must include the sequence name (Keys.SEQUENCE_NAME) and length (Keys.SEQUENCE_LENGTH). All other keys from Keys will be stored in the resulting attributes.

Parameters:

Name Type Description Default
meta Dict[Union[Keys, str], Any]

the python dictionary with keys from Keys. This is typically the dictionary stored in the "SQ" level of the two-level dictionary returned by the pysam.AlignmentHeader#to_dict() method.

required
index int

the 0-based index to use for this sequence

required
Source code in fgpyo/fasta/sequence_dictionary.py
@staticmethod
def from_sam(meta: Dict[Union[Keys, str], Any], index: int) -> "SequenceMetadata":
    """Builds a `SequenceMetadata` from a dictionary.  The keys must include the sequence
    name (`Keys.SEQUENCE_NAME`) and length (`Keys.SEQUENCE_LENGTH`).  All other keys from
    `Keys` will be stored in the resulting attributes.

    Args:
        meta: the python dictionary with keys from `Keys`.  This is typically the dictionary
              stored in the `"SQ"` level of the two-level dictionary returned by the
              `pysam.AlignmentHeader#to_dict()` method.
        index: the 0-based index to use for this sequence
    """
    name = meta[Keys.SEQUENCE_NAME]
    length = meta[Keys.SEQUENCE_LENGTH]
    attributes = copy.deepcopy(meta)
    del attributes[Keys.SEQUENCE_NAME]
    del attributes[Keys.SEQUENCE_LENGTH]
    return SequenceMetadata(name=name, length=length, index=index, attributes=attributes)
same_as
same_as(other: SequenceMetadata) -> bool

Returns true if the sequences share a common reference name (including aliases), have the same length, and the same MD5 if both have MD5s.

Source code in fgpyo/fasta/sequence_dictionary.py
def same_as(self, other: "SequenceMetadata") -> bool:
    """Returns true if the sequences share a common reference name (including aliases), have
    the same length, and the same MD5 if both have MD5s."""
    if self.length != other.length:
        return False
    elif self.name != other.name and other.name not in self.all_names:
        return False
    self_m5 = self.md5
    other_m5 = other.md5
    if self_m5 is None or other_m5 is None:
        return True
    else:
        return self_m5 == other_m5
to_sam
to_sam() -> Dict[str, Any]

Converts the sequence metadata to a dictionary equivalent to one item in the list of sequences from pysam.AlignmentHeader#to_dict()["SQ"].

Source code in fgpyo/fasta/sequence_dictionary.py
def to_sam(self) -> Dict[str, Any]:
    """Converts the sequence metadata to a dictionary equivalent to one item in the
    list of sequences from `pysam.AlignmentHeader#to_dict()["SQ"]`."""
    meta_dict: Dict[str, Any] = {
        f"{Keys.SEQUENCE_NAME}": self.name,
        f"{Keys.SEQUENCE_LENGTH}": self.length,
    }
    if len(self.attributes) > 0:
        meta_dict = {**meta_dict, **self.attributes}

    return meta_dict

Topology

Bases: StrEnum

Enumeration for the topology of reference sequences (SAM @SQ.TP)

Source code in fgpyo/fasta/sequence_dictionary.py
@unique
class Topology(StrEnum):
    """Enumeration for the topology of reference sequences (SAM @SQ.TP)"""

    LINEAR = "LINEAR"
    CIRCULAR = "CIRCULAR"

Modules