Source code for bsxplorer.MetageneClasses

from __future__ import annotations

from pathlib import Path
from typing import Literal

import numpy as np
import polars as pl
import seaborn as sns
from scipy import stats

from .Plots import (
    BoxPlot, savgol_line, LinePlotData, LinePlot, plot_stat_expr, HeatMapData, HeatMap,
    BoxPlotData
)
from .SeqMapper import CytosinesFileCM, SequenceFile
from .Base import (
    MetageneBase,
    MetageneFilesBase,
    read_metagene,
    validate_metagene_args
)
from .Clusters import ClusterSingle, ClusterMany
from .UniversalReader_batches import ReportTypes
from .utils import MetageneSchema, AvailableSumfunc, CONTEXTS
from .GenomeClass import Genome
from .UniversalReader_classes import UniversalReader


[docs]class Metagene(MetageneBase): """ Stores Metagene data. """
[docs] def __init__(self, report_df: pl.DataFrame, **kwargs): super().__init__(report_df, **kwargs)
[docs] @classmethod def from_bismark( cls, file: str | Path, genome: pl.DataFrame, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, block_size_mb: int = 100, use_threads: bool = True, sumfunc: AvailableSumfunc = "wmean" ): """ Constructor for Metagene class from Bismark ``coverage2cytosine`` report. Parameters ---------- file Path to bismark genomeWide report. genome ``polars.Dataframe`` with gene ranges (from :class:`Genome`) up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into block_size_mb Block size for reading. (Block size ≠ amount of RAM used. Reader allocates approx. Block size * 20 memory for reading.) use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` sumfunc Summary function to calculate density for window with. Returns ------- Metagene Examples -------- >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/bismark.CX_report.txt' >>> metagene = Metagene.from_bismark(path, genome, up_windows=500, body_windows=1000, down_windows=500) """ reader = UniversalReader(**(locals() | dict(report_type="bismark"))) args = validate_metagene_args(genome, up_windows, body_windows, down_windows, sumfunc) report_df = read_metagene(**(locals() | args)) return cls(report_df, upstream_windows=args["upstream_windows"], gene_windows=args["body_windows"], downstream_windows=args["downstream_windows"])
[docs] @classmethod def from_cgmap( cls, file: str | Path, genome: pl.DataFrame, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, block_size_mb: int = 100, use_threads: bool = True, sumfunc: AvailableSumfunc = "wmean" ): """ Constructor for Metagene class from BSSeeker2 CGmap file. Parameters ---------- file Path to CGmap file report. genome ``polars.Dataframe`` with gene ranges (from :class:`Genome`) up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into block_size_mb Block size for reading. (Block size ≠ amount of RAM used. Reader allocates approx. Block size * 20 memory for reading.) use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` sumfunc Summary function to calculate density for window with. Returns ------- Metagene Examples -------- >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/CGmap.txt' >>> metagene = Metagene.from_cgmap(path, genome, up_windows=500, body_windows=1000, down_windows=500) """ reader = UniversalReader(**(locals() | dict(report_type="cgmap"))) args = validate_metagene_args(genome, up_windows, body_windows, down_windows, sumfunc) report_df = read_metagene(**(locals() | args)) return cls(report_df, upstream_windows=args["upstream_windows"], gene_windows=args["body_windows"], downstream_windows=args["downstream_windows"])
[docs] @classmethod def from_binom( cls, file: str | Path, genome: pl.DataFrame, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, p_value: float = .05, use_threads=True, ): """ Constructor for Metagene class from :meth:`BinomialData.preprocess` ``.parquet`` file. Only ``"mean"`` summary function is supported for construction :class:`Metagene` from binom data. Parameters ---------- p_value P-value of cytosine methylation for it to be considered methylated. file Path to preprocessed `.parquet` file. genome ``polars.Dataframe`` with gene ranges (from :class:`Genome`) up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` Returns ------- Metagene Examples -------- >>> save_name = "preprocessed.parquet" >>> BinomialData.preprocess("path/to/bismark.CX_report.txt", report_type="bismark", name=save_name) >>> >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> metagene = Metagene.from_binom(save_name, genome, up_windows=500, body_windows=1000, down_windows=500) """ reader = UniversalReader(file, "binom", methylation_pvalue=p_value, use_threads=use_threads) args = validate_metagene_args(genome, up_windows, body_windows, down_windows, "wmean") report_df = read_metagene(**(locals() | args)) return cls(report_df, upstream_windows=args["upstream_windows"], gene_windows=args["body_windows"], downstream_windows=args["downstream_windows"])
[docs] @classmethod def from_bedGraph( cls, file: str | Path, genome: pl.DataFrame, fasta: str | Path, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, sumfunc: AvailableSumfunc = "wmean", block_size_mb: int = 30, use_threads: bool = True, save_preprocessed: bool = False, temp_dir: str = Path.cwd() ): """ Constructor for Metagene class from ``.bedGraph`` file. Parameters ---------- file Path to ``.bedGraph`` file. genome ``polars.Dataframe`` with gene ranges (from :class:`Genome`) fasta Path to FASTA genome sequence file or path to preprocessed with :class:`Sequence` cytosine file. up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into sumfunc Summary function to calculate density for window with. temp_dir Directory for temporary files. save_preprocessed Does preprocessed sequence file need to be saved block_size_mb Block size for reading. (Block size ≠ amount of RAM used. Reader allocates approx. Block size * 20 memory for reading.) use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` Returns ------- Metagene Examples -------- >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/report.bedGraph' >>> fasta = 'path/to/sequence.fa' >>> metagene = Metagene.from_bedGraph(path, genome, fasta, up_windows=500, body_windows=1000, down_windows=500) """ with CytosinesFileCM(fasta) as cm: cytosine_file = cm.cytosine_path if not cm.is_cytosine: SequenceFile(fasta).preprocess_cytosines(cytosine_file) reader = UniversalReader(file, report_type="bedgraph", use_threads=use_threads, cytosine_file=cytosine_file, block_size_mb=block_size_mb) args = validate_metagene_args(genome, up_windows, body_windows, down_windows, sumfunc) report_df = read_metagene(reader, genome, up_windows, body_windows, down_windows, sumfunc) return cls(report_df, upstream_windows=args["upstream_windows"], gene_windows=args["body_windows"], downstream_windows=args["downstream_windows"])
[docs] @classmethod def from_coverage( cls, file: str | Path, genome: pl.DataFrame, fasta: str | Path, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, sumfunc: AvailableSumfunc = "wmean", block_size_mb: int = 30, use_threads: bool = True, save_preprocessed: bool = False, temp_dir: str = "./" ): """ Constructor for Metagene class from ``.cov`` file. Parameters ---------- file Path to ``.cov`` file. genome ``polars.Dataframe`` with gene ranges (from :class:`Genome`) fasta Path to FASTA genome sequence file or path to preprocessed with :class:`Sequence` cytosine file. up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into sumfunc Summary function to calculate density for window with. temp_dir Directory for temporary files. save_preprocessed Does preprocessed sequence file need to be saved block_size_mb Block size for reading. (Block size ≠ amount of RAM used. Reader allocates approx. Block size * 20 memory for reading.) use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` Returns ------- Metagene Examples -------- >>> path = 'path/to/report.cov' >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> fasta = 'path/to/sequence.fa' >>> metagene = Metagene.from_coverage(path, genome, fasta, up_windows=500, body_windows=1000, down_windows=500) """ with CytosinesFileCM(fasta) as cm: cytosine_file = cm.cytosine_path if not cm.is_cytosine: SequenceFile(fasta).preprocess_cytosines(cytosine_file) reader = UniversalReader(file, report_type="bedgraph", use_threads=use_threads, cytosine_file=cytosine_file, block_size_mb=block_size_mb) args = validate_metagene_args(genome, up_windows, body_windows, down_windows, sumfunc) report_df = read_metagene(reader, genome, up_windows, body_windows, down_windows, sumfunc) return cls(report_df, upstream_windows=args["upstream_windows"], gene_windows=args["body_windows"], downstream_windows=args["downstream_windows"])
# Todo Check and update
[docs] def filter( self, context: Literal["CG", "CHG", "CHH", None] = None, strand: Literal["+", "-", None] = None, chr: str = None, genome: pl.DataFrame = None, id: list[str] = None, coords: list[str] = None ) -> Metagene: """ Method for filtering metagene. Parameters ---------- context Methylation context (CG, CHG, CHH) to filter (only one). strand Strand to filter (+ or -). chr Chromosome name to filter. genome DataFrame with annotation to filter with (from :class:`Genome`) Returns ------- Filtered :class:`Metagene`. Examples -------- >>> metagene shape: (3, 9) ┌─────────────┬────────┬───────┬────────────────┬─────────┬───────────────┬──────────┬─────┬───────┐ │ chr ┆ strand ┆ start ┆ gene ┆ context ┆ id ┆ fragment ┆ sum ┆ count │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ cat ┆ cat ┆ u64 ┆ cat ┆ cat ┆ cat ┆ u32 ┆ f32 ┆ u32 │ ╞═════════════╪════════╪═══════╪════════════════╪═════════╪═══════════════╪══════════╪═════╪═══════╡ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CG ┆ gene-AT1G0101 ┆ 4 ┆ 0.0 ┆ 1 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CHH ┆ gene-AT1G0101 ┆ 19 ┆ 0.0 ┆ 2 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CHH ┆ gene-AT1G0101 ┆ 20 ┆ 0.0 ┆ 1 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ └─────────────┴────────┴───────┴────────────────┴─────────┴───────────────┴──────────┴─────┴───────┘ >>> metagene.filter(context = "CG", strand = "+") shape: (3, 9) ┌─────────────┬────────┬───────┬────────────────┬─────────┬───────────────┬──────────┬─────┬───────┐ │ chr ┆ strand ┆ start ┆ gene ┆ context ┆ id ┆ fragment ┆ sum ┆ count │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ cat ┆ cat ┆ u64 ┆ cat ┆ cat ┆ cat ┆ u32 ┆ f32 ┆ u32 │ ╞═════════════╪════════╪═══════╪════════════════╪═════════╪═══════════════╪══════════╪═════╪═══════╡ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CG ┆ gene-AT1G0101 ┆ 4 ┆ 0.0 ┆ 1 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CG ┆ gene-AT1G0101 ┆ 166 ┆ 0.0 ┆ 1 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ │ NC_003070.9 ┆ + ┆ 3631 ┆ NC_003070.9:36 ┆ CG ┆ gene-AT1G0101 ┆ 264 ┆ 0.0 ┆ 1 │ │ ┆ ┆ ┆ 31-5899 ┆ ┆ 0 ┆ ┆ ┆ │ └─────────────┴────────┴───────┴────────────────┴─────────┴───────────────┴──────────┴─────┴───────┘ """ context_filter = self.report_df["context"] == context if context is not None else True strand_filter = self.report_df["strand"] == strand if strand is not None else True chr_filter = self.report_df["chr"] == chr if chr is not None else True metadata = self.metadata metadata["context"] = context metadata["strand"] = strand if genome is not None: def genome_filter(df: pl.DataFrame): cast_dtypes = {"chr": pl.Utf8, "strand": pl.Utf8} cast_cat_dtypes = {"chr": pl.Categorical, "strand": pl.Categorical} return df.cast(cast_dtypes).join(genome.cast(cast_dtypes).select(["chr", "start"]), on=["chr", "start"]).cast(cast_cat_dtypes) else: genome_filter = lambda df: df if id is not None: def id_filter(df: pl.DataFrame): return df.filter(pl.col("id").cast(pl.String).is_in(id)) else: id_filter = lambda df: df if coords is not None: def coords_filter(df: pl.DataFrame): return df.filter(pl.col("gene").is_in(coords)) else: coords_filter = lambda df: df if context_filter is None and strand_filter is None and chr_filter is None: return self else: return self.__class__( coords_filter(id_filter(genome_filter(self.report_df.filter(context_filter & strand_filter & chr_filter)))), **metadata)
[docs] def resize(self, to_fragments: int = None) -> Metagene: """ Mutate DataFrame to fewer fragments. Parameters ---------- to_fragments Number of TOTAL (including flanking regions) fragments per gene. Returns ------- Resized :class:`Metagene`. Examples -------- >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/bismark.CX_report.txt' >>> metagene = Metagene.from_bismark(path, genome, up_windows=500, body_windows=1000, down_windows=500) >>> metagene Metagene with 20223666 windows total. Filtered by None context and None strand. Upstream windows: 500. Body windows: 1000. Downstream windows: 500. >>> metagene.resize(100) Metagene with 4946800 windows total. Filtered by None context and None strand. Upstream windows: 25. Body windows: 50. Downstream windows: 25. """ if self.upstream_windows is not None and self.gene_windows is not None and self.downstream_windows is not None: from_fragments = self.total_windows else: from_fragments = self.report_df["fragment"].max() + 1 if to_fragments is None or from_fragments <= to_fragments: return self resized = ( self.report_df.lazy() .with_columns( ((pl.col("fragment") / from_fragments) * to_fragments).floor() .cast(MetageneSchema.fragment) ) .group_by( by=['chr', 'strand', 'start', 'gene', 'id', 'context', 'fragment'] ).agg([ pl.sum('sum').alias('sum'), pl.sum('count').alias('count') ]) ).collect() metadata = self.metadata metadata["upstream_windows"] = metadata["upstream_windows"] // (from_fragments // to_fragments) metadata["downstream_windows"] = metadata["downstream_windows"] // (from_fragments // to_fragments) metadata["gene_windows"] = metadata["gene_windows"] // (from_fragments // to_fragments) return self.__class__(resized, **metadata)
[docs] def trim_flank(self, upstream=True, downstream=True) -> Metagene: """ Trim Metagene flanking regions. Parameters ---------- upstream Trim upstream region? downstream Trim downstream region? Returns ------- Trimmed :class:`Metagene` Examples -------- >>> metagene Metagene with 20223666 windows total. Filtered by None context and None strand. Upstream windows: 500. Body windows: 1000. Downstream windows: 500. >>> metagene.trim_flank() Metagene with 7750085 windows total. Filtered by None context and None strand. Upstream windows: 0. Body windows: 1000. Downstream windows: 0. Or if you do not need to trim one of flanking regions (e.g. upstream). >>> metagene.trim_flank(upstream=False) Metagene with 16288550 windows total. Filtered by None context and None strand. Upstream windows: 500. Body windows: 1000. Downstream windows: 0. """ trimmed = self.report_df.lazy() metadata = self.metadata.copy() if downstream: trimmed = ( trimmed .filter(pl.col("fragment") < self.upstream_windows + self.gene_windows) ) metadata["downstream_windows"] = 0 if upstream: trimmed = ( trimmed .filter(pl.col("fragment") > self.upstream_windows - 1) .with_columns(pl.col("fragment") - self.upstream_windows) ) metadata["upstream_windows"] = 0 return self.__class__(trimmed.collect(), **metadata)
# TODO finish annotation
[docs] def cluster( self, count_threshold: int = 5, na_rm: float | None = None) -> ClusterSingle: """ Cluster regions with hierarchical clustering, by their methylation pattern. Parameters ---------- na_rm Replace empty windows by specified value. count_threshold Minimum counts per window Returns ------- :class:`ClusterSingle` See Also ------- ClusterSingle : For possible analysis options """ return ClusterSingle(self, count_threshold, na_rm)
@staticmethod def _reverse_strand(df, max_fragment): return ( df .filter(pl.col("strand") == "-") .with_columns((max_fragment - pl.col("fragment")).alias("fragment")) .sort("fragment") ) def line_plot_data( self, resolution: int = None, smooth: int = 50, confidence: float = 0., stat: str = "wmean", merge_strands: bool = True, label="" ): """ Parameters ---------- confidence Probability for confidence bands (e.g. 0.95) smooth Number of windows for `SavGol <https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html>`_ filter (set 0 for no smoothing). Applied only if `flank_windows` and `body_windows` params are specified. resolution Number of fragments to resize to. Keep None if no resize is needed. stat Summary function to use for plot. Possible options: ``mean``, ``wmean``, ``log``, ``wlog``, ``qN`` merge_strands Does negative strand need to be reversed label Label for the data. Returns ------- :class:`LinePlotData` """ resized = self if resolution is None else self.resize(resolution) df = resized.report_df # Merge strands if merge_strands: df = df.filter(pl.col("strand") != "-").extend(self._reverse_strand(df, df["fragment"].max())) # Apply stats expr res = ( df .group_by("fragment") .agg([ plot_stat_expr(stat).alias("density"), pl.col("sum"), pl.col("count"), pl.sum("count").alias("n"), (pl.col("sum") / pl.col("count")).mean().alias("average"), (pl.col("sum") - (pl.col("sum") / pl.col("count"))).mean().pow(2).alias("variance") ]) .sort("fragment") ) if 0 < confidence < 1 and stat in ["mean", "wmean"]: res = ( res .with_columns( (pl.col("variance") / pl.col("n")).sqrt().alias("scale") ) .with_columns( pl.struct(["n", "average", "scale"]) .map_elements( lambda field: stats.t.interval(confidence, df=field["n"] - 1, loc=field["average"], scale=field["scale"]), return_dtype=pl.List(pl.Float64) ).alias("interval") ) .with_columns(pl.col("interval").list.to_struct(fields=["lower", "upper"])) .unnest("interval") ) elif 0 < confidence < 1 and not (stat in ["mean", "wmean"]): raise ValueError("Confidence bands available only for mean and wmean stat parameters.") # Fill empty windows template = pl.DataFrame( dict(fragment=list(range(self.total_windows))), schema=dict(fragment=res.schema["fragment"]) ) joined = template.join(res, on="fragment", how="left") # Calculate CI lower = None upper = None if 0 < confidence < 1 and stat in ["mean", "wmean"]: upper = savgol_line(joined["upper"].to_numpy(), smooth) * 100 lower = savgol_line(joined["lower"].to_numpy(), smooth) * 100 # Smooth and convert to percents y = savgol_line(joined["density"].to_numpy(), smooth) * 100 x = np.arange(len(y)) return LinePlotData(x, y, resized._x_ticks, resized._borders, lower, upper, label, ["Upstream", "", "Body", "", "Downstream"])
[docs] def line_plot( self, resolution: int = None, stat="wmean", smooth: int = 50, confidence: float = 0., merge_strands: bool = True ) -> LinePlot: """ Create :class:`LinePlot` method. Parameters ---------- confidence Probability for confidence bands (e.g. 0.95) smooth Number of windows for `SavGol <https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html>`_ filter (set 0 for no smoothing). Applied only if `flank_windows` and `body_windows` params are specified. resolution Number of fragments to resize to. Keep None if no resize is needed. stat Summary function to use for plot. Possible options: ``mean``, ``wmean``, ``log``, ``wlog``, ``qN`` merge_strands Does negative strand need to be reversed Returns ------- :class:`LinePlot` Notes ----- - ``mean`` – Default mean between bins, when count of cytosine residues in bin IS NOT taken into account - ``wmean`` – Weighted mean between bins. Cytosine residues in bin IS taken into account - ``log`` – NOT weighted geometric mean. - ``wlog`` - Weighted geometric mean. - ``qN`` – Return quantile by ``N`` percent (e.g. "``q50``") See Also -------- :doc:`LinePlot example<../../markdowns/test>` Examples -------- Firstly we need to initialize Metagene class >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/bismark.CX_report.txt' >>> metagene = Metagene.from_bismark(path, genome, up_windows=500, body_windows=1000, down_windows=500) Next we can optionally filter metagene by context and strand. >>> filtered = metagene.filter(context = "CG", strand = "-") And LinePlot can be created >>> lp = filtered.line_plot() >>> figure = lp.draw_mpl() >>> figure.show() .. image:: ../../images/lineplot/lp_ara_mpl.png No filtering is suitable too. Then LinePlot will visualize all methylation contexts. >>> lp = metagene.line_plot() >>> figure = lp.draw_mpl() >>> figure.show() .. image:: ../../images/lineplot/ara_multi_mpl.png You can use Plotly version for all plots as well. >>> figure = lp.draw_plotly() >>> figure.show() .. image:: ../../images/lineplot/ara_multi_plotly.png See Also -------- LinePlot : For more information about plottling parameters. """ resized = self.resize(resolution) lp_data = resized.line_plot_data(resolution, smooth, confidence, stat, merge_strands) return LinePlot(lp_data)
[docs] def contexts_line_plot( self, resolution: int = None, stat="wmean", smooth: int = 50, confidence: float = 0., merge_strands: bool = True ) -> LinePlot: resized = self.resize(resolution) lp_data = [ filtered.line_plot_data(resolution, smooth, confidence, stat, merge_strands, context) for context, filtered in zip(CONTEXTS, map(lambda context: resized.filter(context=context), CONTEXTS)) if len(filtered) > 0 ] return LinePlot(lp_data)
def heat_map_data( self, nrow: int = 100, ncol: int = 100, label=None ): resized = self.resize(ncol) report_df = resized.report_df # Merge strands report_df = report_df.filter(pl.col("strand") != "-").extend(self._reverse_strand(report_df, report_df["fragment"].max())) # sort by rows and add row numbers hm_data = ( report_df.lazy() .group_by("gene") .agg([ pl.col("fragment"), (pl.col("sum") / pl.col("count")).alias("density") ]) .with_columns( (pl.col("density").list.sum() / (resized.total_windows + 1)).alias("order") ) .sort('order', descending=True) .with_row_count(name='row') .with_columns( (pl.col('row') / (pl.max("row") + 1) * nrow).floor().alias('row').cast(pl.UInt16) ) .explode(['fragment', 'density']) .group_by(["row", "fragment"]) .agg(pl.mean("density")) ) # prepare full template template = ( pl.LazyFrame(data={"row": list(range(nrow))}) .with_columns(pl.lit(list(range(0, resized.total_windows))).alias("fragment")) .explode("fragment") .with_columns([ pl.col("fragment").cast(MetageneSchema.fragment), pl.col("row").cast(pl.UInt16) ]) ) # join template with actual data hm_data = ( # template join with orig template.join(hm_data, on=['row', 'fragment'], how='left') .fill_null(0) .sort(['row', 'fragment']) .group_by('row', maintain_order=True) .agg(pl.col('density')) .collect() ['density'] .to_list() ) # convert to matrix and percents hm_data = np.array(hm_data, dtype=np.float32) * 100 return HeatMapData(hm_data, resized._x_ticks, resized._borders, label if label is not None else "", ["Upstream", "", "Body", "", "Downstream"])
[docs] def heat_map( self, nrow: int = 100, ncol: int = 100, ) -> HeatMap: """ Create :class:`HeatMap` method. Parameters ---------- nrow Number of fragments to resize to. Keep None if no resize is needed. ncol Number of columns in the resulting heat-map. stat Summary function to use for plot. Possible options: ``mean``, ``wmean``, ``log``, ``wlog``, ``qN`` Returns ------- :class:`HeatMap` Examples -------- Firstly we need to initialize Metagene class >>> genome = Genome.from_gff("path/to/genome.gff").gene_body() >>> >>> path = 'path/to/bismark.CX_report.txt' >>> metagene = Metagene.from_bismark(path, genome, up_windows=500, body_windows=1000, down_windows=500) Next we need to (in contrast with :meth:`Metagene.line_plot`) filter metagene by context and strand. >>> filtered = metagene.filter(context = "CG", strand = "-") And HeatMap can be created. Let it's width be 200 columns and height – 100 rows. >>> hm = filtered.heat_map(nrow=100, ncol=200) >>> figure = hm.draw_mpl() >>> figure.show() .. image:: ../../images/heatmap/ara_mpl.png Plotly version is also available: >>> figure = hm.draw_plotly() >>> figure.show() .. image:: ../../images/heatmap/ara_plotly.png See Also -------- Metagene.line_plot : For more information about `stat` parameter. HeatMap : For more information about plotting parameters. """ hm_data = self.heat_map_data(nrow, ncol) return HeatMap(hm_data)
def __str__(self): representation = (f'Metagene with {len(self)} windows total.\n' f'Filtered by {self.context} context and {self.strand} strand.\n' f'Upstream windows: {self.upstream_windows}.\n' f'Body windows: {self.gene_windows}.\n' f'Downstream windows: {self.downstream_windows}.\n') return representation def __repr__(self): return self.__str__() def box_plot_data(self, filter_context: Literal["CG", "CHG", "CHH"] | None = None, label=""): df = self.filter(context=filter_context).report_df if filter_context is not None else self.report_df if not df.is_empty(): data = ( df .group_by(["chr", "start"]) .agg([ pl.first("gene").alias("locus"), pl.first("strand"), pl.first("id"), (pl.sum("sum") / pl.sum("count")).alias("density") ]) ) return BoxPlotData( data["density"].to_list(), label, data["locus"].to_list(), data["id"].to_list() ) else: return BoxPlotData.empty(label)
[docs] def context_box_plot(self): data = [self.box_plot_data(context, context) for context in CONTEXTS] return BoxPlot(data)
[docs]class MetageneFiles(MetageneFilesBase): """ Stores and plots multiple data for :class:`Metagene`. If you want to compare Bismark data with different genomes, create this class with a list of :class:`Bismark` classes. """
[docs] @classmethod def from_list( cls, filenames: list[str | Path], genomes: pl.DataFrame | list[pl.DataFrame], labels: list[str] = None, up_windows: int = 0, body_windows: int = 2000, down_windows: int = 0, report_type: ReportTypes = "bismark", block_size_mb: int = 50, use_threads: bool = True, sumfunc: AvailableSumfunc = "wmean", **kwargs ) -> MetageneFiles: """ Create istance of :class:`MetageneFiles` from list of paths. Parameters ---------- filenames List of filenames to read from genomes Annotation DataFrame or list of Annotations (may be different annotations) labels Labels for plots for Metagenes up_windows Number of windows upstream region to split into body_windows Number of windows body region to split into down_windows Number of windows downstream region to split into report_type Type of input report. Possible options: ``bismark``, ``cgmap``, ``binom``, ``bedgraph``, ``coverage``. block_size_mb Block size for reading. (Block size ≠ amount of RAM used. Reader allocates approx. Block size * 20 memory for reading.) use_threads Do multi-threaded or single-threaded reading. If multi-threaded option is used, number of threads is defined by `multiprocessing.cpu_count()` sumfunc Summary function to calculate density for window with. Returns ------- MetageneFiles Instance of :class:`MetageneFiles` See Also ________ Metagene Examples -------- Initialization using :meth:`MetageneFiles.from_list` >>> ara_genome = Genome.from_gff("/path/to/arath.gff").gene_body(min_length=2000) >>> brd_genome = Genome.from_gff("/path/to/bradi.gff").gene_body(min_length=2000) >>> >>> metagenes = MetageneFiles.from_list( >>> ["path/to/bradi.txt", "path/to/ara.txt"], >>> [brd_genome, ara_genome], >>> ["BraDi", "AraTh"], >>> report_type="bismark", >>> up_windows=250, body_windows=500, down_windows=250 >>> ) :class:`MetageneFiles` can be initialized explicitly: >>> metagene_ara = Metagene.from_bismark("path/to/ara.txt", ara_genome, up_windows=250, body_windows=500, down_windows=250) >>> metagene_brd = Metagene.from_bismark("path/to/ara.txt", ara_genome, up_windows=250, body_windows=500, down_windows=250) >>> metagenes = MetageneFiles(samples=[metagene_brd, metagene_ara], labels=["BraDi", "AraTh"]) The resulting objects will be identical. Warnings -------- When :class:`MetageneFiles` is initialized explicitly, number of windows needs ot be the same in evety sample """ read_fnc = { "bismark": Metagene.from_bismark, "binom": Metagene.from_binom, "cgmap": Metagene.from_cgmap, "bedgraph": Metagene.from_bedGraph, "coverage": Metagene.from_coverage } if not isinstance(genomes, list): genomes = [genomes] * len(filenames) else: if len(genomes) != len(filenames): raise AttributeError("Number of genomes and filenames provided does not match") default_args = dict( up_windows=up_windows, body_windows=body_windows, down_windows=down_windows, use_threads=use_threads ) if report_type not in ["binom"]: default_args |= dict( block_size_mb=block_size_mb, sumfunc=sumfunc ) samples: list[Metagene] = [] for file, genome in zip(filenames, genomes): sample = read_fnc[report_type](file=file, genome=genome, **default_args, **kwargs) samples.append(sample) return cls(samples, labels)
[docs] def filter(self, context: str = None, strand: str = None, chr: str = None, genome: pl.DataFrame = None, id: list[str] = None, coords: list[str] = None) -> MetageneFiles: """ :meth:`Metagene.filter` all metagenes. Parameters ---------- context Methylation context (CG, CHG, CHH) to filter (only one). strand Strand to filter (+ or -). chr Chromosome name to filter. genome DataFrame with annotation to filter with (e.g. from :class:`Genome`) Returns ------- Filtered :class:`MetageneFiles`. See Also -------- Metagene.filter : For examples. """ return self.__class__([sample.filter(context, strand, chr, genome, id, coords) for sample in self.samples], self.labels)
[docs] def trim_flank(self, upstream=True, downstream=True): """ :meth:`Metagene.trim_flank` all metagenes. Parameters ---------- upstream Trim upstream region? downstream Trim downstream region? Returns ------- Trimmed :class:`Metagene` See Also -------- Metagene.trim_flank : For examples. """ return self.__class__([sample.trim_flank(upstream, downstream) for sample in self.samples], self.labels)
[docs] def resize(self, to_fragments: int): """ :meth:`Metagene.resize` all metagenes. Parameters ---------- to_fragments Number of TOTAL (including flanking regions) fragments per gene. Returns ------- Resized :class:`Metagene`. See Also -------- Metagene.resize : For examples. """ return self.__class__([sample.resize(to_fragments) for sample in self.samples], self.labels)
[docs] def merge(self): """ Merge :class:`MetageneFiles` into single :class:`Metagene` Warnings -------- **ALL** metagenes in :class:`MetageneFiles` need to be aligned to the same annotation. Parameters ---------- Returns ------- Instance of :class:`Metagene`, with merged replicates. """ pl.enable_string_cache() metadata = [sample.metadata for sample in self.samples] upstream_windows = set([md.get("upstream_windows") for md in metadata]) gene_windows = set([md.get("gene_windows") for md in metadata]) downstream_windows = set([md.get("downstream_windows") for md in metadata]) if len(upstream_windows) == len(downstream_windows) == len(gene_windows) == 1: merged = ( pl.concat([sample.report_df for sample in self.samples]).lazy() .group_by(["strand", "context", "chr", "gene", "start", "id", "fragment"], maintain_order=True) .agg([pl.sum("sum").alias("sum"), pl.sum("count").alias("count")]) .select(self.samples[0].report_df.columns) ).collect() return Metagene(merged, upstream_windows=list(upstream_windows)[0], downstream_windows=list(downstream_windows)[0], gene_windows=list(gene_windows)[0]) else: raise Exception("Metadata for merge DataFrames does not match!")
[docs] def line_plot( self, resolution: int = None, smooth: int = 50, confidence: float = 0., stat: str = "wmean", merge_strands: bool = True ): """ Create :class:`LinePlotFiles` method. Parameters ---------- resolution Number of fragments to resize to. Keep None if no resize is needed. stat Summary function to use for plot. Possible options: ``mean``, ``wmean``, ``log``, ``wlog``, ``qN`` merge_strands Does negative strand need to be reversed confidence Probability for confidence bands (e.g. 0.95) smooth Number of windows for `SavGol <https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html>`_ filter (set 0 for no smoothing). Applied only if `flank_windows` and `body_windows` params are specified. Returns ------- :class:`LinePlot` See Also -------- Metagene.line_plot : for more information about `stat` parameter. Examples -------- >>> filtered = metagenes.filter("CG") >>> lp = metagenes.line_plot() Matplotlib version: >>> figure = lp.draw_mpl() >>> figure.show() .. image:: ../../images/lineplot/ara_bradi_mpl.png Plotly version >>> figure = lp.draw_plotly() >>> figure.show() Now when we have metagene initlaized .. image:: ../../images/lineplot/ara_bradi_plotly.png """ lp_data = [ sample.line_plot_data(resolution, smooth, confidence, stat, merge_strands, label) for sample, label in zip(self.samples, self.labels) ] return LinePlot(lp_data)
[docs] def heat_map(self, nrow: int = 100, ncol: int = None) -> HeatMap: """ Create :class:`HeatMapNew` method. Parameters ---------- nrow Number of fragments to resize to. Keep None if no resize is needed. ncol Number of columns in the resulting heat-map. stat Summary function to use for plot. Possible options: ``mean``, ``wmean``, ``log``, ``wlog``, ``qN`` Returns ------- :class:`HeatMap` See Also -------- Metagene.line_plot : for more information about `stat` parameter. Metagene.heat_map Examples -------- >>> filtered = metagenes.filter("CG") >>> hm = metagenes.heat_map(nrow=100, ncol=100) Matplotlib version: >>> figure = hm.draw_mpl(major_labels=None) >>> figure.show() .. image:: ../../images/heatmap/ara_bradi_mpl.png Plotly version >>> figure = hm.draw_plotly(major_labels=None) >>> figure.show() Now when we have metagene initlaized .. image:: ../../images/heatmap/ara_bradi_plotly.png """ hm_data = [ sample.heat_map_data(nrow, ncol, label) for sample, label in zip(self.samples, self.labels) ] return HeatMap(hm_data)
def box_plot_data(self, filter_context: Literal["CG", "CHG", "CHH"] | None = None): return [sample.box_plot_data(filter_context, label) for label, sample in zip(self.labels, self.samples)] # Todo add fixes from issue14-16#fix
[docs] def box_plot(self): return BoxPlot(self.box_plot_data())
[docs] def dendrogram(self, q: float = .75, **kwargs): """ Cluster samples by total methylation level of regions and draw dendrogram. Parameters ---------- q Quantile of regions, which will be clustered kwargs Keyword arguments for `seaborn.clustermap <https://seaborn.pydata.org/generated/seaborn.clustermap.html>`_ Returns ------- ``matplotlib.pyplot.Figure`` """ gene_stats = [] for sample, label in zip(self.samples, self.labels): gene_stats.append( sample.report_df .group_by(["chr", "strand", "start", "gene"]) .agg((pl.sum("sum") / pl.sum("count")).alias("density")) .with_columns(pl.lit(label).alias("label")) ) gene_set = set.intersection(*[set(stat["gene"].to_list()) for stat in gene_stats]) if len(gene_set) < 1: raise ValueError("Region set intersection is empty. Are Metagenes read with same genome?") gene_stats = [stat.filter(pl.col("gene").is_in(list(gene_set))) for stat in gene_stats] dendro_data = pl.concat(gene_stats).pivot(values="density", columns="label", index="gene", aggregate_function="mean") matrix = dendro_data.select(pl.all().exclude("gene")).to_pandas() # Filter by variance if q > 0: var = matrix.to_numpy().var(1) matrix = matrix[var > np.quantile(var, q)] fig = sns.clustermap(matrix, **kwargs) return fig
[docs] def cluster(self, count_threshold=5, na_rm: float | None = None): """ Cluster samples regions by methylation pattern. Parameters ---------- count_threshold Minimum number of reads per fragment to include it in analysis. na_rm Value to fill empty fragments. (None if not full regions need to be deleted) Returns ------- ClusterMany """ return ClusterMany(self, count_threshold, na_rm)