Name: Bio Chipseq Qc
Author: huamu668

스킬 검색.../

Bio Chipseq Qc | Skills Pool

pysam

pybedtools

# Count reads in peaks
reads_in_peaks=$(bedtools intersect -a chip.bam -b peaks.narrowPeak -u | samtools view -c -)
total_reads=$(samtools view -c -F 260 chip.bam)

# Calculate FRiP
frip=$(echo "scale=4; $reads_in_peaks / $total_reads" | bc)
echo "FRiP: $frip"

# Convert peaks to SAF format
awk 'BEGIN{OFS="\t"} {print $4, $1, $2, $3, "."}' peaks.narrowPeak > peaks.saf

# Count reads in peaks
featureCounts -a peaks.saf -F SAF -o peak_counts.txt chip.bam

# FRiP from summary
grep -v "^#" peak_counts.txt.summary

import pysam
import pybedtools

def calculate_frip(bam_file, peak_file):
    bam = pysam.AlignmentFile(bam_file, 'rb')
    total_reads = bam.count(read_callback=lambda r: not r.is_unmapped and not r.is_secondary)

    peaks = pybedtools.BedTool(peak_file)
    reads_in_peaks = 0
    for peak in peaks:
        reads_in_peaks += bam.count(peak.chrom, peak.start, peak.end)

    frip = reads_in_peaks / total_reads
    return frip

frip = calculate_frip('chip.bam', 'peaks.narrowPeak')
print(f'FRiP: {frip:.4f}')

# Run SPP cross-correlation analysis
Rscript run_spp.R \
    -c=chip.bam \
    -savp=chip_cc.pdf \
    -out=chip_cc.txt \
    -odir=qc/

# Output columns:
# 1: filename
# 2: numReads
# 3: estFragLen (estimated fragment length)
# 4: corr_estFragLen
# 5: phantomPeak
# 6: corr_phantomPeak
# 7: argmin_corr (minimum strand shift)
# 8: min_corr
# 9: NSC (Normalized Strand Coefficient)
# 10: RSC (Relative Strand Coefficient)
# 11: QualityTag

# Parse results
awk -F'\t' '{
    print "Fragment length:", $3
    print "NSC:", $9
    print "RSC:", $10
    print "Quality:", $11
}' chip_cc.txt

library(spp)

chip_data <- read.bam.tags('chip.bam')
binding_characteristics <- get.binding.characteristics(chip_data, srange=c(50, 500), bin=5)

# Cross-correlation plot
pdf('cc_plot.pdf')
plot(binding_characteristics$cross.correlation, type='l',
     xlab='Strand shift', ylab='Cross-correlation')
abline(v=binding_characteristics$peak$x, col='red')
dev.off()

# Extract metrics
print(paste('Fragment length:', binding_characteristics$peak$x))

# NRF: Non-Redundant Fraction (unique reads / total reads)
total=$(samtools view -c -F 260 chip.bam)
unique=$(samtools view -F 260 chip.bam | cut -f1-4 | sort -u | wc -l)
nrf=$(echo "scale=4; $unique / $total" | bc)
echo "NRF: $nrf"

# PBC1: PCR Bottleneck Coefficient 1 (M1/Mdistinct)
# M1 = locations with exactly 1 read
# Mdistinct = distinct genomic locations

bedtools bamtobed -i chip.bam | \
    awk '{print $1":"$2"-"$3}' | \
    sort | uniq -c | \
    awk '{
        if($1==1) m1++
        mdist++
    } END {
        print "M1:", m1
        print "Mdistinct:", mdist
        print "PBC1:", m1/mdist
    }'

# Call peaks on each replicate
macs3 callpeak -t rep1.bam -c input.bam -n rep1 -g hs
macs3 callpeak -t rep2.bam -c input.bam -n rep2 -g hs

# Sort by signal value (column 7)
sort -k7,7nr rep1_peaks.narrowPeak > rep1_sorted.narrowPeak
sort -k7,7nr rep2_peaks.narrowPeak > rep2_sorted.narrowPeak

# Run IDR
idr --samples rep1_sorted.narrowPeak rep2_sorted.narrowPeak \
    --input-file-type narrowPeak \
    --rank signal.value \
    --output-file idr_output.txt \
    --plot idr_plot.pdf \
    --log-output-file idr.log

# Call peaks on pooled data
samtools merge -f pooled.bam rep1.bam rep2.bam
macs3 callpeak -t pooled.bam -c input.bam -n pooled -g hs

# Run IDR with oracle
idr --samples rep1_sorted.narrowPeak rep2_sorted.narrowPeak \
    --peak-list pooled_peaks.narrowPeak \
    --input-file-type narrowPeak \
    --rank signal.value \
    --output-file idr_oracle.txt

# Count peaks at different IDR thresholds
awk '$5 >= 540' idr_output.txt | wc -l  # IDR < 0.05 (conservative)
awk '$5 >= 415' idr_output.txt | wc -l  # IDR < 0.1 (optimal)

# IDR output columns:
# 1-3: chr, start, end
# 4: name
# 5: scaled IDR (-125 * log2(IDR))
# 6: strand
# 7: signal (from rep1)
# 8: signal (from rep2)
# 9: local IDR
# 10: global IDR

# Split one sample and check self-consistency
samtools view -s 0.5 chip.bam -b > pseudo_rep1.bam
samtools view -s 2.5 chip.bam -b > pseudo_rep2.bam

# Call peaks on pseudo-replicates
macs3 callpeak -t pseudo_rep1.bam -c input.bam -n pseudo1 -g hs
macs3 callpeak -t pseudo_rep2.bam -c input.bam -n pseudo2 -g hs

# Run IDR
idr --samples pseudo1_peaks.narrowPeak pseudo2_peaks.narrowPeak \
    --input-file-type narrowPeak \
    --output-file self_idr.txt

# Assess enrichment with fingerprint plot
plotFingerprint \
    -b chip.bam input.bam \
    --labels ChIP Input \
    -o fingerprint.pdf \
    --outRawCounts fingerprint.tab \
    --outQualityMetrics fingerprint_qc.txt

# Good ChIP shows curve shifted right of diagonal
# Input follows diagonal

# TSS enrichment
computeMatrix reference-point \
    -S chip.bw \
    -R genes.bed \
    --referencePoint TSS \
    -a 3000 -b 3000 \
    -o matrix.gz

plotProfile \
    -m matrix.gz \
    -o tss_enrichment.pdf \
    --perGroup

# Sample correlation
multiBamSummary bins \
    -b rep1.bam rep2.bam rep3.bam \
    -o results.npz

plotCorrelation \
    -in results.npz \
    --corMethod spearman \
    --whatToPlot heatmap \
    -o correlation.pdf \
    --outFileCorMatrix correlation.tab

#!/bin/bash
sample=$1
input=$2
peaks=$3

echo "=== ChIP-seq QC Report: $sample ===" > qc_report.txt

# FRiP
reads_in_peaks=$(bedtools intersect -a $sample -b $peaks -u | samtools view -c -)
total_reads=$(samtools view -c -F 260 $sample)
frip=$(echo "scale=4; $reads_in_peaks / $total_reads" | bc)
echo "FRiP: $frip" >> qc_report.txt

# Cross-correlation
Rscript run_spp.R -c=$sample -out=cc.txt -odir=.
nsc=$(cut -f9 cc.txt)
rsc=$(cut -f10 cc.txt)
echo "NSC: $nsc" >> qc_report.txt
echo "RSC: $rsc" >> qc_report.txt

# Library complexity
unique=$(samtools view -F 260 $sample | cut -f1-4 | sort -u | wc -l)
nrf=$(echo "scale=4; $unique / $total_reads" | bc)
echo "NRF: $nrf" >> qc_report.txt

# Fingerprint
plotFingerprint -b $sample $input -o fingerprint.pdf --outQualityMetrics fingerprint_qc.txt

cat qc_report.txt

Metric	Tool	Ideal Value
FRiP	bedtools/featureCounts	> 0.05 (TF), > 0.1 (histone)
NSC	phantompeakqualtools	> 1.1
RSC	phantompeakqualtools	> 1.0
NRF	samtools/bedtools	> 0.8
PBC1	bedtools	> 0.8
IDR (replicates)	idr	> 70% concordance

Target	Minimum FRiP	Good FRiP
TF (narrow)	0.01	> 0.05
Histone (broad)	0.10	> 0.20
H3K4me3	0.05	> 0.15
H3K27ac	0.05	> 0.10

Metric	Marginal	Acceptable	Ideal
NSC	< 1.05	1.05 - 1.1	> 1.1
RSC	< 0.8	0.8 - 1.0	> 1.0
QualityTag	-2	0	1 or 2

Metric	Severe	Mild	None
NRF	< 0.5	0.5 - 0.8	> 0.8
PBC1	< 0.5	0.5 - 0.8	> 0.8
PBC2	< 1	1 - 3	> 3

Comparison	Expected IDR Peaks	Notes
True replicates	> 70% of pooled	Biological concordance
Pseudo-replicates	> 80% of sample	Technical consistency
Rep vs Pooled	~100% of rep peaks	Subset relationship

Bio Chipseq Qc

Version Compatibility

ChIP-seq Quality Control

Bio Chipseq Qc

Version Compatibility

ChIP-seq Quality Control

FRiP (Fraction of Reads in Peaks)

Calculate FRiP with bedtools

Calculate FRiP with featureCounts

Calculate FRiP with pysam

FRiP Thresholds

Cross-Correlation Analysis (NSC/RSC)

Run phantompeakqualtools

Interpret NSC and RSC

NSC/RSC Thresholds

Plot Cross-Correlation in R

Library Complexity (NRF, PBC1, PBC2)

Calculate with bedtools

Library Complexity Thresholds

IDR (Irreproducibility Discovery Rate)

Run IDR Analysis

IDR with Pooled Peaks

Interpret IDR Results

IDR Self-Consistency Check

IDR Quality Guidelines

deepTools QC Metrics

plotFingerprint

computeMatrix and plotProfile

plotCorrelation

Complete QC Pipeline

QC Summary Table

Automation Audit Ops

Github Qa Labels

Jupyter Notebook

Tidb Integrationtest Recorder

Quality Nonconformance

Hugging Face Trackio

Bio Chipseq Qc

Version Compatibility

ChIP-seq Quality Control

Bio Chipseq Qc

Version Compatibility

ChIP-seq Quality Control

FRiP (Fraction of Reads in Peaks)

Calculate FRiP with bedtools

Calculate FRiP with featureCounts

Calculate FRiP with pysam

FRiP Thresholds

Cross-Correlation Analysis (NSC/RSC)

Run phantompeakqualtools

Interpret NSC and RSC

NSC/RSC Thresholds

Plot Cross-Correlation in R

Library Complexity (NRF, PBC1, PBC2)

Calculate with bedtools

Library Complexity Thresholds

IDR (Irreproducibility Discovery Rate)

Run IDR Analysis

IDR with Pooled Peaks

Interpret IDR Results

IDR Self-Consistency Check

IDR Quality Guidelines

deepTools QC Metrics

plotFingerprint

computeMatrix and plotProfile

plotCorrelation

Complete QC Pipeline

QC Summary Table

Related Skills

Automation Audit Ops

Github Qa Labels

Jupyter Notebook

Tidb Integrationtest Recorder

Quality Nonconformance

Hugging Face Trackio