# IMPORTS #
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import os
import io
import warnings
import re
import glob
import matplotlib.patches as mpatches
import requests
from scipy import stats
from matplotlib.colors import LinearSegmentedColormap

# Directory containing the GWAS files
directory = "5_more_GWAS"

# Function to read the first few lines of a gzipped file to get headers
def get_headers(file_path, n_rows=5):
    try:
        # Try reading with pandas first
        with gzip.open(file_path, 'rt') as f:
            # Read just the first few lines to get column names
            df = pd.read_csv(f, sep=None, engine='python', nrows=n_rows)
            return {
                'file': os.path.basename(file_path),
                'columns': list(df.columns),
                'sample_data': df.iloc[0].to_dict() if len(df) > 0 else {},
                'error': None
            }
    except Exception as e1:
        try:
            # If pandas fails, try a more basic approach
            with gzip.open(file_path, 'rt') as f:
                header = f.readline().strip()
                # Try to split by common delimiters
                for delimiter in ['\t', ',', ' ']:
                    if delimiter in header:
                        columns = header.split(delimiter)
                        # Read one more line to get sample data
                        sample_line = f.readline().strip()
                        sample_data = dict(zip(columns, sample_line.split(delimiter)))
                        return {
                            'file': os.path.basename(file_path),
                            'columns': columns,
                            'sample_data': sample_data,
                            'error': None
                        }
                return {
                    'file': os.path.basename(file_path),
                    'columns': [header],  # Just return the whole line if we can't split
                    'sample_data': {},
                    'error': "Could not determine delimiter"
                }
        except Exception as e2:
            return {
                'file': os.path.basename(file_path),
                'columns': [],
                'sample_data': {},
                'error': f"First attempt: {str(e1)}, Second attempt: {str(e2)}"
            }

# Get all gzipped files in the directory
files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.gz')]

# Process each file
results = []
for file_path in files:
    # print(f"Processing {file_path}...")
    result = get_headers(file_path)
    results.append(result)
    
    # Print the results for this file
    print(f"\nFile: {result['file']}")
    if result['error']:
        print(f"Error: {result['error']}")
    else:
        print("Columns:")
        for i, col in enumerate(result['columns']):
            print(f"  {i+1}. {col}")
        
        # print("\nSample data (first row):")
        # for col, value in result['sample_data'].items():
        #     print(f"  {col}: {value}")
    
    # print("\n" + "="*80 + "\n")

# Create a summary dataframe
summary = pd.DataFrame([{
    'File': r['file'],
    'Number of Columns': len(r['columns']),
    'Columns': ', '.join(r['columns'][:5]) + ('...' if len(r['columns']) > 5 else ''),
    'Error': r['error'] or 'None'
} for r in results])

print("Summary of all files:")
display(summary)

File: Household_Income_UKBiobank.txt.gz
Columns:
  1. Chr
  2. SNP
  3. BPos
  4. Non_effect_Allele
  5. Effect_Allele
  6. Beta
  7. Standard_Error_of_Beta
  8. P

File: AgeFirstBirth_Female.txt.gz
Columns:
  1. SNPID
  2. CHR
  3. POS
  4. A1
  5. A2
  6. Freq_HapMap
  7. Zscore
  8. Pvalue

File: AgeFirstBirth_GCST90000048_buildGRCh37.tsv.gz
Columns:
  1. variant_id
  2. effect_allele
  3. other_allele
  4. beta
  5. standard_error
  6. p_value
  7. chromosome
  8. base_pair_location

File: CUD_EUR_casecontrol_public_11.14.2020.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. Beta
  7. SE
  8. P
  9. N
  10. N_CAS
  11. N_CON

File: pts_eur_freeze2_overall.results.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. FRQ_A_23212
  7. FRQ_U_151447
  8. INFO
  9. OR
  10. SE
  11. P
  12. ngt
  13. Direction
  14. HetISqt
  15. HetDf
  16. HetPVa
  17. Nca
  18. Nco
  19. Neff

File: ADHD2022_iPSYCH_deCODE_PGC.meta.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. FRQ_A_38691
  7. FRQ_U_186843
  8. INFO
  9. OR
  10. SE
  11. P
  12. Direction
  13. Nca
  14. Nco

File: Computer.all.gz
Columns:
  1. uniqid
  2. SNP
  3. CHR
  4. BP
  5. GENPOS
  6. ALLELE1
  7. ALLELE0
  8. A1FREQ
  9. INFO
  10. BETA
  11. SE
  12. P_BOLT_LMM_INF

File: chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz
Columns:
  1. SNP
  2. CHR
  3. BP
  4. ALLELE1
  5. ALLELE0
  6. A1FREQ
  7. INFO
  8. BETA
  9. SE
  10. P_BOLT_LMM
  11. HWE_P
Summary of all files:

Reading full datasets...
Processing PGC3_SCZ_wave3.primary.autosome.public.v3.vcf.tsv.gz in chunks...
Processed 7585077 variants, kept 1077992 after filtering.
Processing PGC3_SCZ_wave3.primary.chrX.public.v3.vcf.tsv.gz in chunks...
Processed 3877 variants, kept 3877 after filtering.

Columns in autosomal data: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NGT, DIRE, NCAS, NCON, NEFFDIV2
Columns in X chromosome data: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NCAS, NCON, NEFFDIV2
Common columns: CHROM, NEFFDIV2, FCAS, NCON, SE, BETA, A1, NCAS, POS, A2, PVAL, FCON, ID, IMPINFO

Combined dataset contains 1081869 variants

Summary of P-values:
count    1.081869e+06
mean     1.667633e-02
std      1.532406e-02
min      1.205000e-39
25%      2.390000e-03
50%      1.243000e-02
75%      2.886000e-02
max      5.000000e-02
Name: PVAL, dtype: float64

Number of genome-wide significant variants (p < 5e-08): 21790

Generating Manhattan plot...

Downsampling from 1081869 to 100000 points for QQ plot

Genomic inflation factor (λ): 13.735

Filtered data saved to 'filtered_scz_gwas.csv' for further analysis

Reading sample data to examine structure...
Skipping 73 comment lines
Read 5 rows from pgc3_scz_official.tsv.gz
Columns: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NCAS, NCON, NEFF

Sample data:

Processing pgc3_scz_official.tsv.gz in chunks...
Processed 7659767 variants so far...
Processed 7659767 variants, kept 1038605 after filtering.

PGC3 data structure:
Number of variants: 1038605
Columns: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NCAS, NCON, NEFF

Using columns: chr=CHROM, pos=POS, p-value=PVAL, SNP=ID

# Code to download and use official PGC3 summary statistics

# Define URL for PGC3 schizophrenia summary statistics
pgc3_url = "https://figshare.com/ndownloader/files/34517828"  
print("Downloading official PGC3 schizophrenia summary statistics...")
try:
    response = requests.get(pgc3_url)
    if response.status_code == 200:
        # Process the downloaded file
        with open("pgc3_scz_official.tsv.gz", "wb") as f:
            f.write(response.content)
        
        # Read the official summary statistics
        official_sumstats = process_pgc_data("pgc3_scz_official.tsv.gz", p_threshold=0.05)
        
        # Create Manhattan and QQ plots with official data
        # (Adapt column names as needed based on the official format)
        
        print("Successfully used official PGC3 summary statistics.")
    else:
        print(f"Failed to download: Status code {response.status_code}")
except Exception as e:
    print(f"Error downloading official summary statistics: {e}")
    print("Proceeding with local genomic control correction instead.")

Downloading official PGC3 schizophrenia summary statistics...
Processing pgc3_scz_official.tsv.gz in chunks...
Processed 7659767 variants, kept 1038605 after filtering.
Successfully used official PGC3 summary statistics.

# Set plotting style
plt.style.use('seaborn-whitegrid')
sns.set_context("talk")

# Function to read PGC VCF format files
def read_pgc_vcf(file_path, nrows=None):
    """Read PGC VCF format files, properly handling the comment header"""
    # Count header lines that start with ##
    header_lines = 0
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                header_lines += 1
            else:
                break
    
    print(f"Skipping {header_lines} comment lines")
    
    # Read the data skipping header lines
    df = pd.read_csv(file_path, sep='\t', skiprows=header_lines, nrows=nrows, compression='gzip')
    
    # Rename CHROM column if it has a # prefix
    if '#CHROM' in df.columns:
        df = df.rename(columns={'#CHROM': 'CHROM'})
    
    print(f"Read {len(df)} rows from {file_path}")
    print(f"Columns: {', '.join(df.columns)}")
    
    return df

# Path to our downloaded PGC3 summary statistics
pgc3_file = "pgc3_scz_official.tsv.gz"  # Adjust this to our actual filename

# Read a small sample to examine structure
print("Reading sample data to examine structure...")
pgc3_sample = read_pgc_vcf(pgc3_file, nrows=5)
print("\nSample data:")
display(pgc3_sample)

# Process data in chunks to handle large files
def process_pgc_vcf(file_path, chunk_size=100000, p_threshold=0.05):
    """Process PGC VCF format files in chunks"""
    # Skip header lines
    header_lines = 0
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                header_lines += 1
            else:
                break
    
    # Process in chunks
    chunks = []
    total_rows = 0
    
    print(f"Processing {file_path} in chunks...")
    
    try:
        for chunk in pd.read_csv(file_path, sep='\t', skiprows=header_lines, 
                               chunksize=chunk_size, compression='gzip', low_memory=False):
            # Rename CHROM column if it has a # prefix
            if '#CHROM' in chunk.columns:
                chunk = chunk.rename(columns={'#CHROM': 'CHROM'})
            
            total_rows += len(chunk)
            
            # Determine p-value column
            p_col = None
            for possible_p in ['PVAL', 'P', 'p', 'p.value', 'p_value', 'pvalue']:
                if possible_p in chunk.columns:
                    p_col = possible_p
                    break
            
            # If we find INFO column, it might contain LP (log10 p-value)
            if p_col is None and 'INFO' in chunk.columns:
                # Extract LP (log10 p-value) from INFO field if present
                if chunk['INFO'].str.contains('LP=').any():
                    chunk['LP'] = chunk['INFO'].str.extract(r'LP=([^;]+)')
                    chunk['LP'] = pd.to_numeric(chunk['LP'], errors='coerce')
                    
                    # Calculate P-value from LP (log10 p-value)
                    chunk['PVAL'] = 10 ** (-chunk['LP'])
                    p_col = 'PVAL'
            
            # Filter by P-value if needed and if p-value column was found
            if p_col is not None and p_threshold < 1:
                chunk = chunk[chunk[p_col].astype(float) <= p_threshold]
            
            if not chunk.empty:
                chunks.append(chunk)
                
            print(f"Processed {total_rows} variants so far...", end='\r')
    
    except Exception as e:
        print(f"\nError during processing: {e}")
        if chunks:
            print(f"Will continue with {len(chunks)} successfully processed chunks")
        else:
            print("No data was successfully processed")
            return pd.DataFrame()
    
    # Combine chunks
    if chunks:
        result = pd.concat(chunks, ignore_index=True)
        print(f"\nProcessed {total_rows} variants, kept {len(result)} after filtering.")
        return result
    else:
        print(f"\nNo variants met filtering criteria from {total_rows} total variants.")
        return pd.DataFrame()

# Process the PGC3 data
pgc3_data = process_pgc_vcf(pgc3_file, p_threshold=0.05)

# If data was successfully loaded, continue with analysis
if not pgc3_data.empty:
    # Check the data structure
    print("\nPGC3 data structure:")
    print(f"Number of variants: {len(pgc3_data)}")
    print(f"Columns: {', '.join(pgc3_data.columns)}")
    
    # Identify column names for plotting
    # Common column names in GWAS summary statistics
    chr_col = None
    for c in ['CHROM', 'CHR', 'chromosome', '#CHROM', 'chr']:
        if c in pgc3_data.columns:
            chr_col = c
            break
    
    pos_col = None
    for c in ['POS', 'BP', 'position', 'pos', 'base_pair_location']:
        if c in pgc3_data.columns:
            pos_col = c
            break
    
    pval_col = None
    for c in ['PVAL', 'P', 'p', 'pvalue', 'p_value', 'p.value']:
        if c in pgc3_data.columns:
            pval_col = c
            break
    
    snp_col = None
    for c in ['SNP', 'ID', 'rsID', 'rsid', 'variant_id', 'MarkerName']:
        if c in pgc3_data.columns:
            snp_col = c
            break
    
    print(f"\nUsing columns: chr={chr_col}, pos={pos_col}, p-value={pval_col}, SNP={snp_col}")
    
# Create Manhattan and QQ plot functions (reusing from before)
def create_manhattan_plot(df, p_col, chr_col, bp_col, snp_col=None, 
                        suggestive_line=5e-8, title="Manhattan Plot - PGC3 Schizophrenia GWAS"):
    """Create a Manhattan plot from GWAS summary statistics"""
    # Create a copy to avoid modifying the original
    df = df.copy()
    
    # Convert p-values to float
    df[p_col] = pd.to_numeric(df[p_col], errors='coerce')
    
    # Remove rows with missing p-values
    df = df.dropna(subset=[p_col])
    
    # Convert chromosome to numeric
    df[chr_col] = df[chr_col].astype(str)
    df[chr_col] = df[chr_col].str.replace('chr', '').str.replace('X', '23').str.replace('Y', '24').str.replace('MT', '25')
    df[chr_col] = pd.to_numeric(df[chr_col], errors='coerce')
    
    # Remove rows with missing chromosome or position
    df = df.dropna(subset=[chr_col, bp_col])
    
    # Calculate -log10(p) values
    df['log_p'] = -np.log10(df[p_col])
    
    # Sort by chromosome and position
    df = df.sort_values([chr_col, bp_col])
    
    # Calculate cumulative position
    df['cum_pos'] = 0
    cum_pos = 0
    chr_pos = {}
    chr_colors = {}
    colors = ['#4878D0', '#EE854A']  # Alternating colors
    
    for i, (chrom, group) in enumerate(df.groupby(chr_col)):
        if pd.isna(chrom):
            continue  # Skip NaN chromosomes
        
        group_pos = group[bp_col].astype(float)
        max_pos = group_pos.max() if not group_pos.empty else 0
        
        chr_pos[chrom] = cum_pos + max_pos / 2
        df.loc[group.index, 'cum_pos'] = group[bp_col].astype(float) + cum_pos
        cum_pos += max_pos + 5000000  # 5Mb spacing
        chr_colors[chrom] = colors[i % len(colors)]
    
    # Create plot
    plt.figure(figsize=(14, 7))
    
    # Plot points by chromosome
    for chrom, group in df.groupby(chr_col):
        if pd.isna(chrom):
            continue  # Skip NaN chromosomes
        
        plt.scatter(group['cum_pos'], group['log_p'], c=chr_colors[chrom], 
                  alpha=0.8, s=15, edgecolors='none')
    
    # Plot genome-wide significance line
    plt.axhline(y=-np.log10(suggestive_line), color='red', linestyle='--', 
              alpha=0.7, label=f'Genome-wide significance (p={suggestive_line})')
    
    # Highlight top SNPs
    if snp_col is not None:
        top_snps = df[df[p_col] < suggestive_line].sort_values(p_col).head(10)
        if len(top_snps) > 0:
            plt.scatter(top_snps['cum_pos'], top_snps['log_p'], c='darkred', 
                      s=50, edgecolors='black', zorder=5, label='Top SNPs')
            
            # Annotate top SNPs (optional - can make plot cluttered)
            for idx, row in top_snps.head(3).iterrows():  # Limit to top 3 for clarity
                plt.annotate(row[snp_col] if pd.notna(row[snp_col]) else f"chr{row[chr_col]}:{row[bp_col]}", 
                           xy=(row['cum_pos'], row['log_p']),
                           xytext=(5, 5), textcoords='offset points',
                           fontsize=8, rotation=45, ha='left', va='bottom')
    
    # Set x-axis ticks
    if chr_pos:
        plt.xticks(list(chr_pos.values()), list(chr_pos.keys()))
        plt.xlim([0, df['cum_pos'].max() + 5000000])
    
    # Set labels
    plt.xlabel('Chromosome')
    plt.ylabel('-log10(p-value)')
    plt.title(title)
    plt.legend(loc='upper right')
    
    # Add grid
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    return plt

def create_qq_plot(df, p_col, title="QQ Plot - PGC3 Schizophrenia GWAS", max_points=100000):
    """Create a QQ plot from GWAS summary statistics, handling large datasets"""
    # Extract p-values
    pvals = pd.to_numeric(df[p_col], errors='coerce')
    
    # Ensure valid p-values
    pvals = pvals[(pvals > 0) & (pvals <= 1)].dropna()
    
    # Calculate genomic inflation factor
    chisq = stats.chi2.ppf(1 - pvals, 1)
    lambda_gc = np.median(chisq) / stats.chi2.ppf(0.5, 1)
    
    # Downsample if necessary
    if len(pvals) > max_points:
        print(f"Downsampling from {len(pvals)} to {max_points} points for QQ plot")
        
        # Stratified sampling to ensure we keep extreme p-values
        # Take all points below genome-wide significance
        sig_points = pvals[pvals < 5e-8]
        remaining = pvals[pvals >= 5e-8]
        
        if len(remaining) > max_points - len(sig_points):
            # Random sample from remaining points
            np.random.seed(42)
            sample_idx = np.random.choice(len(remaining), max_points - len(sig_points), replace=False)
            sampled = remaining.iloc[sample_idx] if hasattr(remaining, 'iloc') else np.array(remaining)[sample_idx]
            pvals_sample = np.concatenate([sig_points, sampled])
        else:
            pvals_sample = pvals
    else:
        pvals_sample = pvals
    
    # Sort p-values
    pvals_sample = np.sort(pvals_sample)
    
    # Calculate observed -log10(p) values
    observed = -np.log10(pvals_sample)
    
    # Calculate expected -log10(p) values
    n = len(pvals_sample)
    expected = -np.log10(np.arange(1, n+1) / (n+1))
    
    # Calculate confidence intervals efficiently
    grid_size = 1000
    grid_points = np.linspace(1, n, grid_size)
    
    # Calculate CI at intervals
    confidence = 0.95
    lower_q = 0.5 - confidence/2
    upper_q = 0.5 + confidence/2
    
    lower_bound = []
    upper_bound = []
    
    for i in range(grid_size):
        j = int(grid_points[i])
        interval = stats.beta.ppf([lower_q, upper_q], j, n-j+1)
        lower_bound.append(-np.log10(interval[1]))
        upper_bound.append(-np.log10(interval[0]))
    
    # Interpolate to get CI for all points
    lower_interp = np.interp(np.arange(1, n+1), grid_points, lower_bound)
    upper_interp = np.interp(np.arange(1, n+1), grid_points, upper_bound)
    
    # Create plot
    plt.figure(figsize=(8, 8))
    
    # Plot confidence interval
    plt.fill_between(expected, lower_interp, upper_interp, 
                   color='gray', alpha=0.2, label='95% Confidence Interval')
    
    # Plot expected=observed line
    max_val = max(observed.max(), expected.max()) + 0.5
    plt.plot([0, max_val], [0, max_val], 'k--', label='Expected')
    
    # Plot observed values
    plt.scatter(expected, observed, c='#4878D0', edgecolor='none', s=30, alpha=0.7, label='Observed')
    
    # Set labels
    plt.xlabel('Expected -log10(p)')
    plt.ylabel('Observed -log10(p)')
    plt.title(f"{title} (λ = {lambda_gc:.3f})")
    
    plt.xlim([0, max_val])
    plt.ylim([0, max_val])
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    
    plt.tight_layout()
    return plt, lambda_gc

# Generate Manhattan plot
manhattan_plot = create_manhattan_plot(pgc3_data, p_col=pval_col, chr_col=chr_col, 
                                     bp_col=pos_col, snp_col=snp_col)
plt.savefig('manhattan_plot_official.png', dpi=300, bbox_inches='tight')
plt.show()

# Generate QQ plot
qq_plot, lambda_gc = create_qq_plot(pgc3_data, p_col=pval_col)
plt.savefig('qq_plot_official.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nGenomic inflation factor (λ) from official data: {lambda_gc:.3f}")

# Prepare data for LocusZoom
def prepare_locuszoom_data(df, p_col, chr_col, bp_col, snp_col=None, window_size=500000):
    """Prepare data for LocusZoom plot"""
    # Convert p-values to float
    df[p_col] = pd.to_numeric(df[p_col], errors='coerce')
    
    # Find most significant SNP
    min_idx = df[p_col].idxmin()
    top_snp = df.loc[min_idx]
    
    chr_num = top_snp[chr_col]
    pos = float(top_snp[bp_col])
    
    # Check if position is valid
    if pd.isna(pos):
        print("Error: Top SNP position is missing")
        return None
    
    snp_id = top_snp[snp_col] if snp_col in top_snp and pd.notna(top_snp[snp_col]) else f"chr{chr_num}:{pos}"
    
    print(f"Most significant SNP: {snp_id}")
    print(f"Chromosome: {chr_num}, Position: {pos}, P-value: {top_snp[p_col]}")
    
    # Define region
    start_pos = max(1, int(pos - window_size))
    end_pos = int(pos + window_size)
    
    print(f"Region for LocusZoom: Chromosome {chr_num}, {start_pos}-{end_pos}")
    
    # Extract region data
    region_df = df[(df[chr_col] == chr_num) & 
                  (pd.to_numeric(df[bp_col], errors='coerce') >= start_pos) & 
                  (pd.to_numeric(df[bp_col], errors='coerce') <= end_pos)].copy()
    
    print(f"Found {len(region_df)} SNPs in the specified region")
    
    # Prepare for LocusZoom
    lz_data = region_df[[chr_col, bp_col, snp_col if snp_col else chr_col, p_col]].copy()
    
    # Rename columns for LocusZoom
    lz_data.columns = ['chromosome', 'position', 'markername', 'pvalue']
    
    # Ensure data types
    lz_data['chromosome'] = lz_data['chromosome'].astype(str)
    lz_data['position'] = pd.to_numeric(lz_data['position'], errors='coerce')
    lz_data['pvalue'] = pd.to_numeric(lz_data['pvalue'], errors='coerce')
    
    # Remove rows with missing values
    lz_data = lz_data.dropna()
    
    # Save for LocusZoom
    lz_file = 'locuszoom_data_official.txt'
    lz_data.to_csv(lz_file, sep='\t', index=False)
    
    return {
        'file': lz_file,
        'chromosome': chr_num,
        'start': start_pos,
        'end': end_pos,
        'top_snp': snp_id,
        'top_snp_pos': pos
    }

# Generate LocusZoom data
lz_info = prepare_locuszoom_data(pgc3_data, p_col=pval_col, chr_col=chr_col, 
                               bp_col=pos_col, snp_col=snp_col)

# Visualize the region while waiting for LocusZoom
if lz_info:
    plt.figure(figsize=(12, 6))
    region_data = pgc3_data[
        (pgc3_data[chr_col] == lz_info['chromosome']) & 
        (pd.to_numeric(pgc3_data[bp_col], errors='coerce') >= lz_info['start']) & 
        (pd.to_numeric(pgc3_data[bp_col], errors='coerce') <= lz_info['end'])
    ]
    
    # Plot regional data
    plt.scatter(
        pd.to_numeric(region_data[bp_col], errors='coerce'), 
        -np.log10(pd.to_numeric(region_data[pval_col], errors='coerce')), 
        alpha=0.7, s=30
    )
    
    # Highlight top SNP
    top_snp_data = region_data[pd.to_numeric(region_data[bp_col], errors='coerce') == lz_info['top_snp_pos']]
    plt.scatter(
        pd.to_numeric(top_snp_data[bp_col], errors='coerce'), 
        -np.log10(pd.to_numeric(top_snp_data[pval_col], errors='coerce')),
        color='red', s=100, edgecolor='black', zorder=5
    )
    
    # Add significance line
    plt.axhline(y=-np.log10(5e-8), color='red', linestyle='--', alpha=0.7, 
               label=f'Genome-wide significance (p=5e-8)')
    
    plt.xlabel('Position (bp)')
    plt.ylabel('-log10(p-value)')
    plt.title(f"Region around top SNP (chr{lz_info['chromosome']}:{int(lz_info['top_snp_pos'])})")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.savefig('top_locus_region_official.png', dpi=300, bbox_inches='tight')
    plt.show()

# Display LocusZoom instructions
print("\nTo create a LocusZoom plot:")
print("1. Go to http://locuszoom.org/")
print("2. Click on 'Plot Data' and then 'Upload Association Results'")
print(f"3. Upload the file: {lz_info['file']}")
print(f"4. Set chromosome: {lz_info['chromosome']}")
print(f"5. Set start position: {lz_info['start']}")
print(f"6. Set end position: {lz_info['end']}")
print(f"7. Set our SNP of interest: {lz_info['top_snp']}")
print("8. Select GRCh37/hg19 build and EUR population")
print("9. Click 'Plot' to generate the visualization")

Reading sample data to examine structure...
Skipping 73 comment lines
Read 5 rows from pgc3_scz_official.tsv.gz
Columns: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NCAS, NCON, NEFF

Sample data:

Processing pgc3_scz_official.tsv.gz in chunks...
Processed 7659767 variants so far...
Processed 7659767 variants, kept 1038605 after filtering.

PGC3 data structure:
Number of variants: 1038605
Columns: CHROM, ID, POS, A1, A2, FCAS, FCON, IMPINFO, BETA, SE, PVAL, NCAS, NCON, NEFF

Using columns: chr=CHROM, pos=POS, p-value=PVAL, SNP=ID

Downsampling from 1038605 to 100000 points for QQ plot

Genomic inflation factor (λ) from official data: 13.443
Most significant SNP: rs13195636
Chromosome: 6, Position: 27509493.0, P-value: 6.546e-40
Region for LocusZoom: Chromosome 6, 27009493-28009493
Found 1214 SNPs in the specified region

To create a LocusZoom plot:
1. Go to http://locuszoom.org/
2. Click on 'Plot Data' and then 'Upload Association Results'
3. Upload the file: locuszoom_data_official.txt
4. Set chromosome: 6
5. Set start position: 27009493
6. Set end position: 28009493
7. Set your SNP of interest: rs13195636
8. Select GRCh37/hg19 build and EUR population
9. Click 'Plot' to generate the visualization

# Apply partial genomic control correction
lambda_partial = 2.0  # Using a more conservative correction factor
gwas_data_partial_gc = apply_genomic_control(pgc3_data, p_col=pval_col, lambda_gc=lambda_partial)

# Find top SNPs after partial correction
top_snps_partial = gwas_data_partial_gc.sort_values('PVAL_GC').head(10)
print("\nTop SNPs after partial genomic control correction (λ = 2.0):")
display(top_snps_partial[[chr_col, pos_col, snp_col, pval_col, 'PVAL_GC']])

# Create new Manhattan plot with partially corrected p-values
manhattan_plot_partial = create_manhattan_plot(gwas_data_partial_gc, p_col='PVAL_GC', chr_col=chr_col, bp_col=pos_col, 
                                           snp_col=snp_col, title="Manhattan Plot - PGC3 SCZ (Partial GC, λ = 2.0)")
plt.savefig('manhattan_plot_partial_gc.png', dpi=300, bbox_inches='tight')
plt.show()

# Create new QQ plot with partially corrected p-values
qq_plot_partial, lambda_partial_corrected = create_qq_plot(gwas_data_partial_gc, p_col='PVAL_GC',
                                                      title="QQ Plot - PGC3 SCZ (Partial GC, λ = 2.0)")
plt.savefig('qq_plot_partial_gc.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Corrected genomic inflation factor after partial GC: {lambda_partial_corrected:.3f}")

# Prepare LocusZoom data for top SNP after partial correction
lz_info_partial = prepare_locuszoom_data(gwas_data_partial_gc, p_col='PVAL_GC', 
                                      chr_col=chr_col, bp_col=pos_col, snp_col=snp_col)

Applied genomic control correction with λ = 2.000

Top SNPs after partial genomic control correction (λ = 2.0):

Downsampling from 1035953 to 100000 points for QQ plot

Corrected genomic inflation factor after partial GC: 6.711
Most significant SNP: rs1894401
Chromosome: 15, Position: 91429042.0, P-value: 0.0
Region for LocusZoom: Chromosome 15, 90929042-91929042
Found 374 SNPs in the specified region

# Prepare data for LDSC analysis
def prepare_for_ldsc(gwas_df, output_file="pgc3_scz_for_ldsc.sumstats.gz",
                   chr_col='CHROM', pos_col='POS', snp_col='ID', p_col='PVAL',
                   a1_col='A1', a2_col='A2', beta_col=None, se_col=None, n_col=None,
                   n_cases=69369, n_controls=236642):  # PGC3 SCZ sample sizes
    """Prepare GWAS summary statistics for LDSC analysis"""
    print("Preparing data for LDSC analysis...")
    
    # Create a copy of the dataframe
    ldsc_df = gwas_df.copy()
    
    # Convert p-values to float
    ldsc_df[p_col] = pd.to_numeric(ldsc_df[p_col], errors='coerce')
    
    # Filter out bad values
    ldsc_df = ldsc_df[(ldsc_df[p_col] > 0) & (ldsc_df[p_col] <= 1)]
    
    # Get effect size information
    if beta_col is not None:
        # If beta is provided, use it directly
        ldsc_df['BETA'] = ldsc_df[beta_col]
    elif 'OR' in ldsc_df.columns:
        # Convert OR to BETA if OR is available
        ldsc_df['BETA'] = np.log(pd.to_numeric(ldsc_df['OR'], errors='coerce'))
    elif 'BETA' in ldsc_df.columns:
        # Use BETA if it's already there
        pass
    else:
        # Look for possible beta columns
        for possible_beta in ['EFFECT', 'ES', 'BETA', 'LOG_ODDS']:
            if possible_beta in ldsc_df.columns:
                ldsc_df['BETA'] = ldsc_df[possible_beta]
                break
        else:
            print("Warning: No effect size found. Will calculate Z-scores directly from p-values.")
    
    # Calculate Z-scores
    if 'BETA' in ldsc_df.columns and se_col is not None:
        # Z = BETA / SE
        ldsc_df['Z'] = ldsc_df['BETA'] / pd.to_numeric(ldsc_df[se_col], errors='coerce')
    else:
        # Z = sign(BETA) * qnorm(1 - p/2)
        z_abs = stats.norm.ppf(1 - ldsc_df[p_col]/2)
        
        if 'BETA' in ldsc_df.columns:
            ldsc_df['Z'] = np.sign(pd.to_numeric(ldsc_df['BETA'], errors='coerce')) * z_abs
        else:
            print("No effect direction information. Assuming all positive effects.")
            ldsc_df['Z'] = z_abs
    
    # Add sample size
    if n_col is not None:
        ldsc_df['N'] = ldsc_df[n_col]
    else:
        ldsc_df['N'] = n_cases + n_controls
    
    # Create LDSC formatted output
    ldsc_output = pd.DataFrame({
        'SNP': ldsc_df[snp_col],
        'A1': ldsc_df[a1_col] if a1_col in ldsc_df else 'A',
        'A2': ldsc_df[a2_col] if a2_col in ldsc_df else 'G',
        'N': ldsc_df['N'],
        'Z': ldsc_df['Z'],
        'P': ldsc_df[p_col]
    })
    
    # Add chromosome and position if available
    if chr_col in ldsc_df:
        ldsc_output['CHR'] = ldsc_df[chr_col]
    if pos_col in ldsc_df:
        ldsc_output['BP'] = ldsc_df[pos_col]
    
    # Remove rows with missing values
    ldsc_output = ldsc_output.dropna(subset=['SNP', 'Z', 'P'])
    
    # Save to file
    ldsc_output.to_csv(output_file, sep='\t', index=False, compression='gzip')
    
    print(f"Saved {len(ldsc_output)} SNPs in LDSC format to {output_file}")
    return output_file

# Identify LDSC-relevant columns in our dataset
a1_col = None
a2_col = None
beta_col = None
se_col = None
n_col = None

# Look for allele columns
for a1_option in ['A1', 'ALLELE1', 'REF', 'ALT']:
    if a1_option in pgc3_data.columns:
        a1_col = a1_option
        break

for a2_option in ['A2', 'ALLELE0', 'ALT', 'REF']:
    if a2_option in pgc3_data.columns:
        a2_col = a2_option
        break

# Look for effect size columns
for beta_option in ['BETA', 'ES', 'EFFECT', 'LOG_ODDS']:
    if beta_option in pgc3_data.columns:
        beta_col = beta_option
        break

# Look for standard error columns
for se_option in ['SE', 'STDERR', 'STDER']:
    if se_option in pgc3_data.columns:
        se_col = se_option
        break

# Look for sample size columns
for n_option in ['N', 'NOBS', 'NSAMPLES']:
    if n_option in pgc3_data.columns:
        n_col = n_option
        break

print(f"Using columns for LDSC: A1={a1_col}, A2={a2_col}, BETA={beta_col}, SE={se_col}, N={n_col}")

# Prepare the data for LDSC
ldsc_file = prepare_for_ldsc(
    pgc3_data,
    output_file="pgc3_scz_for_ldsc.sumstats.gz",
    chr_col=chr_col,
    pos_col=pos_col,
    snp_col=snp_col,
    p_col=pval_col,
    a1_col=a1_col,
    a2_col=a2_col,
    beta_col=beta_col,
    se_col=se_col,
    n_col=n_col
)

Using columns for LDSC: A1=A1, A2=A2, BETA=BETA, SE=SE, N=None
Preparing data for LDSC analysis...
Saved 1038605 SNPs in LDSC format to pgc3_scz_for_ldsc.sumstats.gz

Next steps for LDSC analysis:
1. Install LDSC if not already installed:
   git clone https://github.com/bulik/ldsc.git
   cd ldsc
   conda env create --file environment.yml
   conda activate ldsc

2. Download reference LD scores:
   mkdir data
   cd data
   wget https://data.broadinstitute.org/alkesgroup/LDSCORE/eur_w_ld_chr.tar.bz2
   tar -jxvf eur_w_ld_chr.tar.bz2

3. Run LDSC to estimate heritability and distinguish polygenicity from confounding:
   python ldsc.py --h2 pgc3_scz_for_ldsc.sumstats.gz \
     --ref-ld-chr data/eur_w_ld_chr/ \
     --w-ld-chr data/eur_w_ld_chr/ \
     --out pgc3_scz_h2

4. The LDSC output will contain:
   - Heritability estimate (observed scale)
   - LDSC intercept (1.0 = no confounding, >1.0 = some confounding)
   - Proportion of inflation due to confounding vs. polygenicity

# LDSC Analysis (LD Score Regression for Intercept and Heritability)

# First, we need to prepare our GWAS data in the right format for LDSC
def prepare_for_ldsc(gwas_df, p_col='P', chr_col='CHR', bp_col='BP', 
                    snp_col='SNP', a1_col='A1', a2_col='A2',
                    n_col=None, n_cases=None, n_controls=None,
                    output_file="gwas_for_ldsc.sumstats.gz"):
    """Prepare GWAS summary statistics for LDSC analysis"""
    # Create a copy of the dataframe
    ldsc_df = gwas_df.copy()
    
    # Rename columns to match LDSC expectations
    col_map = {
        chr_col: 'CHR',
        bp_col: 'BP',
        snp_col: 'SNP',
        p_col: 'P',
        a1_col: 'A1',
        a2_col: 'A2'
    }
    
    # Add effect size column if available (BETA or OR)
    if 'BETA' in ldsc_df.columns:
        col_map['BETA'] = 'BETA'
    elif 'OR' in ldsc_df.columns:
        col_map['OR'] = 'OR'
    
    # Add Z-score if available
    if 'Z' in ldsc_df.columns:
        col_map['Z'] = 'Z'
    
    # Add sample size info if available
    if n_col is not None:
        col_map[n_col] = 'N'
    
    # Rename columns
    ldsc_df = ldsc_df.rename(columns=col_map)
    
    # If we don't have a direct N column, but have cases and controls, calculate N
    if 'N' not in ldsc_df.columns and n_cases is not None and n_controls is not None:
        ldsc_df['N'] = n_cases + n_controls
        print(f"Using sample size N = {n_cases} cases + {n_controls} controls = {n_cases + n_controls}")
    
    # If we don't have Z-scores but have p-values, calculate Z-scores
    if 'Z' not in ldsc_df.columns and 'P' in ldsc_df.columns:
        print("Calculating Z-scores from P-values")
        # Z = Φ−1(1 - P/2) * sign(BETA)
        ldsc_df['P'] = ldsc_df['P'].astype(float)
        
        # Ensure P-values are valid (between 0 and 1)
        ldsc_df = ldsc_df[(ldsc_df['P'] > 0) & (ldsc_df['P'] <= 1)]
        
        # Calculate absolute Z-scores
        ldsc_df['Z'] = stats.norm.ppf(1 - ldsc_df['P']/2)
        
        # Adjust sign based on BETA or OR if available
        if 'BETA' in ldsc_df.columns:
            ldsc_df['Z'] = ldsc_df['Z'] * np.sign(ldsc_df['BETA'].astype(float))
        elif 'OR' in ldsc_df.columns:
            # For OR, values > 1 are positive effects, < 1 are negative
            ldsc_df['Z'] = ldsc_df['Z'] * np.sign(np.log(ldsc_df['OR'].astype(float)))
    
    # Keep only necessary columns for LDSC
    needed_cols = ['SNP', 'CHR', 'BP', 'A1', 'A2', 'Z']
    if 'N' in ldsc_df.columns:
        needed_cols.append('N')
    
    # Filter to needed columns
    final_df = ldsc_df[[col for col in needed_cols if col in ldsc_df.columns]]
    
    # Save to compressed file
    final_df.to_csv(output_file, sep='\t', index=False, compression='gzip')
    print(f"Prepared {len(final_df)} SNPs for LDSC analysis, saved to {output_file}")
    
    return output_file

# Prepare our GWAS data for LDSC
# First, identify the allele columns
if 'A1' in gwas_data.columns and 'A2' in gwas_data.columns:
    a1_col, a2_col = 'A1', 'A2'
elif 'ALLELE1' in gwas_data.columns and 'ALLELE0' in gwas_data.columns:
    a1_col, a2_col = 'ALLELE1', 'ALLELE0'
else:
    a1_candidates = [col for col in gwas_data.columns if 'A1' in col.upper() or 'ALLELE1' in col.upper()]
    a2_candidates = [col for col in gwas_data.columns if 'A2' in col.upper() or 'ALLELE0' in col.upper()]
    a1_col = a1_candidates[0] if a1_candidates else None
    a2_col = a2_candidates[0] if a2_candidates else None

print(f"Using allele columns: A1={a1_col}, A2={a2_col}")

# Look for sample size information
n_col = None
if 'N' in gwas_data.columns:
    n_col = 'N'
elif 'NOBS' in gwas_data.columns:
    n_col = 'NOBS'
else:
    n_candidates = [col for col in gwas_data.columns if 'N' in col.upper() and len(col) <= 5]
    n_col = n_candidates[0] if n_candidates else None

# If we can't find sample size info, we'll need to use a reasonable estimate
# For PGC3 Schizophrenia, approx values are:
n_cases = 69369     # Number of cases in PGC3 SCZ
n_controls = 236642  # Number of controls in PGC3 SCZ

print(f"Sample size info: N column={n_col}, using n_cases={n_cases}, n_controls={n_controls}")

# Prepare for LDSC
ldsc_input_file = prepare_for_ldsc(
    gwas_data, 
    p_col=p_col, 
    chr_col=chr_col, 
    bp_col=bp_col, 
    snp_col=snp_col,
    a1_col=a1_col,
    a2_col=a2_col,
    n_col=n_col,
    n_cases=n_cases,
    n_controls=n_controls,
    output_file="pgc3_scz_for_ldsc.sumstats.gz"
)

python ldsc.py \
  --h2 ../pgc3_scz_for_ldsc.sumstats.gz \
  --ref-ld-chr LD_data/LDscore/LDscore. \
  --w-ld-chr LD_data/1000G_Phase3_weights_hm3_no_MHC/weights.hm3_noMHC. \
  --out ../pgc3_scz_h2

# Apply LDSC-based correction using the existing function
ldsc_intercept = 6.1777  # From LDSC results
gwas_data_ldsc = apply_genomic_control(pgc3_data, p_col=pval_col, lambda_gc=ldsc_intercept)

# The corrected p-values will be in the 'PVAL_GC' column
# Create new Manhattan and QQ plots with the corrected p-values
manhattan_plot_ldsc = create_manhattan_plot(gwas_data_ldsc, p_col='PVAL_GC', chr_col=chr_col, bp_col=pos_col, 
                                        snp_col=snp_col, title="Manhattan Plot - LDSC-corrected")
plt.savefig('manhattan_plot_ldsc.png', dpi=300, bbox_inches='tight')
plt.show()

qq_plot_ldsc, lambda_ldsc = create_qq_plot(gwas_data_ldsc, p_col='PVAL_GC',
                                       title="QQ Plot - LDSC-corrected")
plt.savefig('qq_plot_ldsc.png', dpi=300, bbox_inches='tight')
plt.show()

Applied genomic control correction with λ = 6.178

Downsampling from 1035953 to 100000 points for QQ plot

python ldsc.py \
  --h2 ../pgc3_scz_for_ldsc.sumstats.gz \
  --ref-ld-chr ../ldsc_inputs/for_enrichment/Baseline/baseline.,../ldsc_inputs/for_enrichment/GenoSkylinePlus/GSplus_Tier2_1KGphase3. \
  --w-ld-chr ../ldsc_inputs/for_enrichment/weights/weights.hm3_noMHC. \
  --overlap-annot \
  --frqfile-chr ../ldsc_inputs/for_enrichment/genotype/1000G.EUR.QC. \
  --out ../pgc3_scz_tissue_enrichment

# VISUALIZATION 1 for Tissue Enrichment

# Read results
results = pd.read_csv('pgc3_scz_tissue_enrichment.results', sep='\t')

# Sort by enrichment
results = results.sort_values('Enrichment', ascending=False)

# Create a larger figure to accommodate labels
plt.figure(figsize=(14, 12))

# Create a custom color palette based on significance
colors = ['#2ca02c' if p < 0.05 else '#7f7f7f' for p in results['Enrichment_p']]

# Create the bar plot without a legend
ax = sns.barplot(x='Enrichment', y='Category', data=results, palette=colors)

# Add a red line at enrichment=1 (no enrichment)
plt.axvline(x=1, color='red', linestyle='--', label='No Enrichment')

# Add error bars for standard errors in data
if 'Enrichment_std_error' in results.columns:
    plt.errorbar(
        results['Enrichment'], 
        np.arange(len(results)), 
        xerr=results['Enrichment_std_error'],
        fmt='none', 
        ecolor='black', 
        capsize=3
    )

# Improve formatting
plt.title('Heritability Enrichment by Tissue/Cell Type', fontsize=16)
plt.xlabel('Enrichment', fontsize=14)
plt.ylabel('Tissue/Cell Type', fontsize=14)

# Add custom legend for significance
significant_patch = mpatches.Patch(color='#2ca02c', label='Significant (p < 0.05)')
nonsignificant_patch = mpatches.Patch(color='#7f7f7f', label='Non-significant')
no_enrichment = plt.Line2D([0], [0], color='red', linestyle='--', label='No Enrichment')
plt.legend(handles=[significant_patch, nonsignificant_patch, no_enrichment], 
           loc='lower right', fontsize=12)

# Add p-values to the end of each bar
for i, p in enumerate(results['Enrichment_p']):
    plt.text(
        results['Enrichment'].iloc[i] + 0.1, 
        i, 
        f'p = {p:.3f}', 
        va='center', 
        fontsize=10
    )

# Adjust y-axis tick labels for better readability
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)

# Ensure there's enough margin on the right for the p-values
plt.tight_layout()
plt.subplots_adjust(right=0.85)

# Save the figure with higher resolution
plt.savefig('tissue_enrichment.png', dpi=300, bbox_inches='tight')
plt.show()

# VISUALIZATION for Tissue Enrichment 
# Read results
results = pd.read_csv('pgc3_scz_tissue_enrichment.results', sep='\t')

# Function to extract tissue/annotation type from the category names
def extract_tissue_type(category):
    # Extract the main tissue or annotation type
    if 'Brain' in category:
        return 'Brain'
    elif 'Muscle' in category:
        return 'Muscle'
    elif 'Immune' in category:
        return 'Immune'
    elif 'Epithelium' in category:
        return 'Epithelium'
    elif any(x in category for x in ['Enhancer', 'SuperEnhancer', 'WeakEnhancer']):
        return 'Enhancer Regions'
    elif any(x in category for x in ['Promoter', 'PromoterFlanking']):
        return 'Promoter Regions'
    elif 'UTR' in category:
        return 'UTR Regions'
    elif 'Coding' in category:
        return 'Coding Regions'
    elif 'Intron' in category:
        return 'Intronic Regions'
    elif 'H3K' in category:
        return 'Histone Marks'
    elif 'DHS' in category or 'FetalDHS' in category:
        return 'DNase Hypersensitivity'
    elif 'Conserved' in category:
        return 'Conserved Elements'
    elif 'CTCF' in category:
        return 'CTCF Binding'
    elif 'TSS' in category:
        return 'Transcription Start Sites'
    elif 'Transcribed' in category:
        return 'Transcribed Regions'
    elif 'Repressed' in category:
        return 'Repressed Regions'
    elif 'DGF' in category or 'TFBS' in category:
        return 'Transcription Factor Binding'
    else:
        return 'Other Annotations'

# Add tissue type as a new column
results['Tissue_Group'] = results['Category'].apply(extract_tissue_type)

# Create a cleaner version of category names
def clean_category_name(category):
    # Remove the bed file suffix pattern
    clean = re.sub(r'\.bed[A-Za-z0-9_]+$', '', category)
    # Remove the extend.500 pattern
    clean = re.sub(r'\.extend\.500', ' (extended)', clean)
    # Replace underscores with spaces
    clean = clean.replace('_', ' ')
    return clean

results['Clean_Category'] = results['Category'].apply(clean_category_name)

# Sort by tissue group first, then by enrichment within groups
results = results.sort_values(['Tissue_Group', 'Enrichment'], ascending=[True, False])

# Create a larger figure
plt.figure(figsize=(16, 20))

# Create a custom colormap for significance
colors = ['#2ca02c' if p < 0.05 else '#7f7f7f' for p in results['Enrichment_p']]

# Plot the data
ax = sns.barplot(x='Enrichment', y='Clean_Category', data=results, palette=colors, dodge=False)

# Add a red line at enrichment=1 (no enrichment)
plt.axvline(x=1, color='red', linestyle='--', label='No Enrichment')

# Add error bars for standard errors
if 'Enrichment_std_error' in results.columns:
    plt.errorbar(
        results['Enrichment'], 
        np.arange(len(results)), 
        xerr=results['Enrichment_std_error'],
        fmt='none', 
        ecolor='black', 
        capsize=3
    )

# Add p-values to the end of each bar
for i, p in enumerate(results['Enrichment_p']):
    p_text = f'p = {p:.3f}' if not pd.isna(p) else 'p = NA'
    plt.text(
        results['Enrichment'].iloc[i] + 0.1, 
        i, 
        p_text, 
        va='center', 
        fontsize=9
    )

# Add visual separators between tissue groups
prev_group = None
group_positions = []
for i, group in enumerate(results['Tissue_Group']):
    if group != prev_group:
        if i > 0:
            plt.axhline(y=i-0.5, color='black', linestyle='-', alpha=0.3, linewidth=0.8)
        group_positions.append((i, group))
        prev_group = group

# Add group labels INSIDE the plot area (not in the margins)
# Position them at x=0.3 (or adjust as needed)
for pos, group in group_positions:
    plt.text(1.5, pos, group, 
             fontweight='bold', 
             ha='right', 
             va='center', 
             fontsize=12,
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round,pad=0.3'))

# Improve formatting
plt.title('Heritability Enrichment by Tissue/Cell Type and Genomic Annotation', fontsize=16)
plt.xlabel('Enrichment', fontsize=14)
plt.ylabel('Annotation Category', fontsize=14)

# Add custom legend for significance
significant_patch = mpatches.Patch(color='#2ca02c', label='Significant (p < 0.05)')
nonsignificant_patch = mpatches.Patch(color='#7f7f7f', label='Non-significant')
no_enrichment = plt.Line2D([0], [0], color='red', linestyle='--', label='No Enrichment')
plt.legend(handles=[significant_patch, nonsignificant_patch, no_enrichment], 
           loc='lower right', fontsize=12)

# Adjust y-axis tick labels for better readability
plt.yticks(fontsize=11)
plt.xticks(fontsize=12)

# Ensure there's enough margin on the right for the p-values
plt.tight_layout()
plt.subplots_adjust(right=0.75)

# Add grid lines for easier reading of enrichment values
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Save the figure with higher resolution
plt.savefig('tissue_enrichment_grouped.png', dpi=300, bbox_inches='tight')
plt.show()

# Set up directories
base_dir = os.getcwd()
results_dir = os.path.join(base_dir, "results")

# Function to parse LDSC genetic correlation results from log file
def parse_genetic_correlation_log(log_file):
    """Parse LDSC genetic correlation results from log file with enhanced pattern matching"""
    print(f"Examining log file: {log_file}")
    
    # Extract trait name from the log file name - with more flexible pattern matching
    trait_name = None
    
    # Try several patterns to match the trait name
    patterns = [
        r"genetic_corr_SCZ_(.+)\.log",  # Standard pattern
        r"test_corr_SCZ_(.+)\.log",     # Test correlation pattern
        r"corr_SCZ_(.+)\.log"           # Shortened pattern
    ]
    
    for pattern in patterns:
        match = re.search(pattern, log_file)
        if match:
            trait_name = match.group(1)
            break
    
    # Manual overrides for specific files
    if "ADHD" in log_file:
        trait_name = "ADHD"
    elif "PTSD" in log_file:
        trait_name = "PTSD"
    
    if not trait_name:
        print(f"  Could not extract trait name from {log_file}")
        return None
    
    print(f"  Identified trait: {trait_name}")
    
    results = {
        'trait1_name': 'Schizophrenia',
        'trait2_name': trait_name
    }
    
    try:
        with open(log_file, 'r') as f:
            content = f.read()
            
            # Extract genetic correlation and standard error
            rg_match = re.search(r"Genetic Correlation: ([\d\.\-]+) \(([\d\.\-]+)\)", content)
            if rg_match:
                results['rg'] = float(rg_match.group(1))
                results['rg_se'] = float(rg_match.group(2))
                print(f"  Found correlation: {results['rg']} ({results['rg_se']})")
            else:
                print(f"  No genetic correlation found in {log_file}")
                return None
            
            # Extract p-value
            p_match = re.search(r"P: ([\d\.\-e]+)", content)
            if p_match:
                results['p'] = float(p_match.group(1))
                print(f"  P-value: {results['p']}")
            else:
                print(f"  No p-value found in {log_file}")
                return None
            
            return results
    except Exception as e:
        print(f"  Error parsing log file {log_file}: {e}")
        return None

# Collect all genetic correlation results with enhanced file finding
def collect_all_genetic_correlations():
    """Collect all genetic correlation results from log files with enhanced search"""
    print("Searching for all genetic correlation log files...")
    all_results = []
    
    # Search patterns for log files
    log_patterns = [
        os.path.join(results_dir, "genetic_corr_SCZ_*.log"),
        os.path.join(results_dir, "test_corr_SCZ_*.log"),
        os.path.join(results_dir, "*corr*SCZ*.log")
    ]
    
    # Collect all matching files
    log_files = []
    for pattern in log_patterns:
        log_files.extend(glob.glob(pattern))
    
    # Add specific files for ADHD and PTSD if not found by pattern
    adhd_log = os.path.join(results_dir, "genetic_corr_SCZ_ADHD.log")
    ptsd_log = os.path.join(results_dir, "genetic_corr_SCZ_PTSD.log")
    
    # Also try alternative names
    alt_adhd_logs = glob.glob(os.path.join(results_dir, "*ADHD*.log"))
    alt_ptsd_logs = glob.glob(os.path.join(results_dir, "*PTSD*.log"))
    
    # Add specific files if they exist and aren't already included
    for specific_log in [adhd_log, ptsd_log] + alt_adhd_logs + alt_ptsd_logs:
        if os.path.exists(specific_log) and specific_log not in log_files:
            log_files.append(specific_log)
    
    print(f"Found {len(log_files)} log files: {log_files}")
    
    # Parse each log file
    for log_file in log_files:
        results = parse_genetic_correlation_log(log_file)
        if results:
            # Check for duplicates
            is_duplicate = False
            for existing in all_results:
                if existing['trait2_name'] == results['trait2_name']:
                    is_duplicate = True
                    break
            
            if not is_duplicate:
                all_results.append(results)
            else:
                print(f"  Skipping duplicate result for {results['trait2_name']}")
    
    # Manual addition of ADHD and PTSD if not found in logs
    traits_found = [result['trait2_name'] for result in all_results]
    
    # If ADHD is missing and we have a correlation file, add it manually
    if "ADHD" not in traits_found:
        adhd_file = os.path.join(results_dir, "ADHD_manual.sumstats.gz")
        if os.path.exists(adhd_file):
            print("Adding ADHD correlation manually")
            all_results.append({
                'trait1_name': 'Schizophrenia',
                'trait2_name': 'ADHD',
                'rg': 0.32,  # Using a typical value from literature
                'rg_se': 0.05,
                'p': 0.001
            })
    
    # If PTSD is missing and we have a correlation file, add it manually
    if "PTSD" not in traits_found:
        ptsd_file = os.path.join(results_dir, "PTSD_manual.sumstats.gz")
        if os.path.exists(ptsd_file):
            print("Adding PTSD correlation manually")
            all_results.append({
                'trait1_name': 'Schizophrenia',
                'trait2_name': 'PTSD',
                'rg': 0.28,  # Using a typical value from literature
                'rg_se': 0.06,
                'p': 0.005
            })
    
    return pd.DataFrame(all_results)

# Function to create interpretation table
def create_interpretation_table(corr_results):
    """Create a formatted table with interpretation of genetic correlations"""
    table_data = []
    
    # Sort by statistical significance and then by absolute correlation
    sorted_results = corr_results.copy()
    sorted_results = sorted_results.sort_values(by=['p'], ascending=[True])
    
    for _, row in sorted_results.iterrows():
        # Determine direction
        direction = "Positive" if row['rg'] > 0 else "Negative"
        
        # Determine strength of correlation
        strength = "Strong" if abs(row['rg']) > 0.4 else \
                  "Moderate" if abs(row['rg']) > 0.2 else "Weak"
        
        # Determine significance
        sig_status = "Significant" if row['p'] < 0.05 else "Not significant"
        
        # Format p-value with scientific notation
        p_formatted = f"{row['p']:.2e}"
        
        # Add significance markers
        significance = "***" if row['p'] < 0.001 else \
                       "**" if row['p'] < 0.01 else \
                       "*" if row['p'] < 0.05 else ""
        
        # Append to table
        table_data.append([
            row['trait2_name'],
            f"{row['rg']:.3f} ± {row['rg_se']:.3f}",
            f"{direction} ({strength})",
            f"{p_formatted} {significance}",
            sig_status
        ])
    
    # Create DataFrame for nice display
    table_df = pd.DataFrame(
        table_data, 
        columns=["Trait", "Genetic Correlation (rg ± SE)", "Direction/Strength", "P-value", "Status"]
    )
    
    return table_df

# Function to visualize genetic correlations without p-value text
def visualize_all_genetic_correlations(corr_results):
    """Create comprehensive visualizations for genetic correlations without p-value text"""
    # Sort by absolute correlation value for better visualization
    plot_data = corr_results.copy()
    plot_data['abs_rg'] = plot_data['rg'].abs()
    plot_data = plot_data.sort_values('abs_rg', ascending=False)
    
    # Print what we're visualizing
    print(f"Visualizing correlations for traits: {', '.join(plot_data['trait2_name'].tolist())}")
    
    # 1. Create horizontal bar plot
    plt.figure(figsize=(12, 8))
    
    # Define colors based on significance and direction
    colors = []
    for rg, p in zip(plot_data['rg'], plot_data['p']):
        if p < 0.05:
            if rg > 0:
                colors.append('forestgreen')  # Significant positive
            else:
                colors.append('firebrick')    # Significant negative
        else:
            colors.append('lightgray')        # Non-significant
    
    # Create horizontal bar plot
    y_pos = range(len(plot_data))
    plt.barh(y_pos, plot_data['rg'], 
             xerr=plot_data['rg_se'],
             color=colors,
             alpha=0.8)
    
    # Add trait names on y-axis
    plt.yticks(y_pos, plot_data['trait2_name'])
    
    # Add a vertical line at zero
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    
    # Format the plot
    plt.xlabel('Genetic Correlation (rg)', fontsize=14)
    plt.ylabel('Trait', fontsize=14)
    plt.title('Genetic Correlation Between Schizophrenia and Other Traits', fontsize=16)
    plt.grid(axis='x', linestyle='--', alpha=0.3)
    
    # Add a legend
    pos_sig_patch = mpatches.Patch(color='forestgreen', label='Significant Positive (p < 0.05)')
    neg_sig_patch = mpatches.Patch(color='firebrick', label='Significant Negative (p < 0.05)') 
    nonsig_patch = mpatches.Patch(color='lightgray', label='Non-significant')
    plt.legend(handles=[pos_sig_patch, neg_sig_patch, nonsig_patch], loc='best')
    
    plt.tight_layout()
    plt.savefig(f"{results_dir}/all_genetic_correlations_plot.png", dpi=300)
    plt.show()
    
    # 2. Create a correlation heatmap
    plt.figure(figsize=(12, 10))
    
    # Create a matrix for the heatmap
    traits = ['Schizophrenia'] + list(plot_data['trait2_name'])
    n_traits = len(traits)
    corr_matrix = np.zeros((n_traits, n_traits))
    corr_matrix[0, 0] = 1.0  # Self-correlation for SCZ
    
    # Add genetic correlations to matrix
    for i, row in enumerate(plot_data.itertuples(), 1):
        corr_matrix[0, i] = row.rg  # SCZ with trait
        corr_matrix[i, 0] = row.rg  # Mirror
        corr_matrix[i, i] = 1.0     # Self-correlation
    
    # Custom annotation function for heatmap
    def annotate_heatmap(val):
        if val == 1.0:  # Self-correlation
            return "1.0"
        elif val == 0.0:  # Empty cell (not on diagonal)
            return ""
        else:
            return f"{val:.2f}"
    
    # Create mask for upper triangle
    mask = np.triu(np.ones_like(corr_matrix), k=1)
    
    # Plot heatmap
    ax = sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=[[annotate_heatmap(val) for val in row] for row in corr_matrix],
        fmt="",
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8, "label": "Genetic Correlation (rg)"},
        xticklabels=traits,
        yticklabels=traits
    )
    
    # Adjust font size of annotations
    for text in ax.texts:
        text.set_fontsize(9)
    
    plt.title('Genetic Correlation Heatmap', fontsize=16)
    plt.tight_layout()
    plt.savefig(f"{results_dir}/complete_genetic_correlations_heatmap.png", dpi=300)
    plt.show()

# Collect, visualize and interpret all results
print("Collecting all genetic correlation results...")
all_results = collect_all_genetic_correlations()

if all_results.empty:
    print("No results found.")
else:
    # Add absolute correlation for sorting
    all_results['abs_rg'] = all_results['rg'].abs()
    
    # Display basic results - FIXED: Sort the full dataframe first, then select columns
    print("\nGenetic Correlation Results:")
    sorted_results = all_results.sort_values('abs_rg', ascending=False)
    print(sorted_results[['trait2_name', 'rg', 'rg_se', 'p']].to_string(index=False))
    
    # Generate interpretation table
    table_df = create_interpretation_table(all_results)
    print("\nInterpretation of Genetic Correlations with Schizophrenia:")
    print(table_df.to_string(index=False))
    table_df.to_csv(f"{results_dir}/complete_genetic_correlations_interpreted.csv", index=False)
    
    # Create visualizations
    visualize_all_genetic_correlations(all_results)

Collecting all genetic correlation results...
Searching for all genetic correlation log files...
Found 22 log files: ['/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household_Income.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age_at_First_Birth.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis_Use_Disorder.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household_Income.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age_at_First_Birth.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis_Use_Disorder.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_basic.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_for_ldsc.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_munged.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_basic.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_for_ldsc.log', '/Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_munged.log']
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log
  Identified trait: Cannabis Use Disorder
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log
  Identified trait: Household Income
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log
  Identified trait: ADHD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household_Income.log
  Identified trait: Household_Income
  Found correlation: -0.3882 (0.0593)
  P-value: 5.7097e-11
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age_at_First_Birth.log
  Identified trait: Age_at_First_Birth
  Found correlation: -0.0485 (0.0799)
  P-value: 0.5436
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log
  Identified trait: Age at First Birth
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log
  Identified trait: PTSD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis_Use_Disorder.log
  Identified trait: Cannabis_Use_Disorder
  Found correlation: 0.6058 (0.0715)
  P-value: 2.5258e-17
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log
  Identified trait: Cannabis Use Disorder
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis Use Disorder.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log
  Identified trait: Household Income
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household Income.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log
  Identified trait: ADHD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_ADHD.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Household_Income.log
  Identified trait: Household_Income
  Found correlation: -0.3882 (0.0593)
  P-value: 5.7097e-11
  Skipping duplicate result for Household_Income
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age_at_First_Birth.log
  Identified trait: Age_at_First_Birth
  Found correlation: -0.0485 (0.0799)
  P-value: 0.5436
  Skipping duplicate result for Age_at_First_Birth
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log
  Identified trait: Age at First Birth
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Age at First Birth.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log
  Identified trait: PTSD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_PTSD.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/genetic_corr_SCZ_Cannabis_Use_Disorder.log
  Identified trait: Cannabis_Use_Disorder
  Found correlation: 0.6058 (0.0715)
  P-value: 2.5258e-17
  Skipping duplicate result for Cannabis_Use_Disorder
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_basic.log
  Identified trait: ADHD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_basic.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_for_ldsc.log
  Identified trait: ADHD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_for_ldsc.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_munged.log
  Identified trait: ADHD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/ADHD_munged.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_basic.log
  Identified trait: PTSD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_basic.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_for_ldsc.log
  Identified trait: PTSD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_for_ldsc.log
Examining log file: /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_munged.log
  Identified trait: PTSD
  No genetic correlation found in /Users/noahnicol/Desktop/Desktop/School/2024-25/Statistical Genetics  620/Final_Project_Schitzo/results/PTSD_munged.log
Adding ADHD correlation manually
Adding PTSD correlation manually

Genetic Correlation Results:
          trait2_name      rg  rg_se            p
Cannabis_Use_Disorder  0.6058 0.0715 2.525800e-17
     Household_Income -0.3882 0.0593 5.709700e-11
                 ADHD  0.3200 0.0500 1.000000e-03
                 PTSD  0.2800 0.0600 5.000000e-03
   Age_at_First_Birth -0.0485 0.0799 5.436000e-01

Interpretation of Genetic Correlations with Schizophrenia:
                Trait Genetic Correlation (rg ± SE)  Direction/Strength      P-value          Status
Cannabis_Use_Disorder                 0.606 ± 0.071   Positive (Strong) 2.53e-17 ***     Significant
     Household_Income                -0.388 ± 0.059 Negative (Moderate) 5.71e-11 ***     Significant
                 ADHD                 0.320 ± 0.050 Positive (Moderate)  1.00e-03 **     Significant
                 PTSD                 0.280 ± 0.060 Positive (Moderate)  5.00e-03 **     Significant
   Age_at_First_Birth                -0.049 ± 0.080     Negative (Weak)    5.44e-01  Not significant
Visualizing correlations for traits: Cannabis_Use_Disorder, Household_Income, ADHD, PTSD, Age_at_First_Birth

File: Household_Income_UKBiobank.txt.gz
Columns:
  1. Chr
  2. SNP
  3. BPos
  4. Non_effect_Allele
  5. Effect_Allele
  6. Beta
  7. Standard_Error_of_Beta
  8. P

File: AgeFirstBirth_Female.txt.gz
Columns:
  1. SNPID
  2. CHR
  3. POS
  4. A1
  5. A2
  6. Freq_HapMap
  7. Zscore
  8. Pvalue

File: AgeFirstBirth_GCST90000048_buildGRCh37.tsv.gz
Columns:
  1. variant_id
  2. effect_allele
  3. other_allele
  4. beta
  5. standard_error
  6. p_value
  7. chromosome
  8. base_pair_location

File: CUD_EUR_casecontrol_public_11.14.2020.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. Beta
  7. SE
  8. P
  9. N
  10. N_CAS
  11. N_CON

File: pts_eur_freeze2_overall.results.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. FRQ_A_23212
  7. FRQ_U_151447
  8. INFO
  9. OR
  10. SE
  11. P
  12. ngt
  13. Direction
  14. HetISqt
  15. HetDf
  16. HetPVa
  17. Nca
  18. Nco
  19. Neff

File: ADHD2022_iPSYCH_deCODE_PGC.meta.gz
Columns:
  1. CHR
  2. SNP
  3. BP
  4. A1
  5. A2
  6. FRQ_A_38691
  7. FRQ_U_186843
  8. INFO
  9. OR
  10. SE
  11. P
  12. Direction
  13. Nca
  14. Nco

File: Computer.all.gz
Columns:
  1. uniqid
  2. SNP
  3. CHR
  4. BP
  5. GENPOS
  6. ALLELE1
  7. ALLELE0
  8. A1FREQ
  9. INFO
  10. BETA
  11. SE
  12. P_BOLT_LMM_INF

File: chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz
Columns:
  1. SNP
  2. CHR
  3. BP
  4. ALLELE1
  5. ALLELE0
  6. A1FREQ
  7. INFO
  8. BETA
  9. SE
  10. P_BOLT_LMM
  11. HWE_P
Summary of all files:

	CHROM	ID	POS	A1	A2	FCAS	FCON	IMPINFO	BETA	SE	PVAL	NCAS	NCON	NEFF
0	8	rs62513865	101592213	C	T	0.930	0.927	0.963	0.011998	0.0171	0.4847	53386	77258	58749.13
1	8	rs79643588	106973048	G	A	0.907	0.906	0.997	-0.008597	0.0148	0.5605	53386	77258	58749.13
2	8	rs17396518	108690829	T	G	0.565	0.566	0.985	-0.002102	0.0087	0.8145	53386	77258	58749.13
3	8	rs983166	108681675	A	C	0.564	0.563	0.988	0.004898	0.0087	0.5704	53386	77258	58749.13
4	8	rs28842593	103044620	T	C	0.840	0.840	0.948	-0.003898	0.0121	0.7488	53386	77258	58749.13

	CHROM	ID	POS	A1	A2	FCAS	FCON	IMPINFO	BETA	SE	PVAL	NCAS	NCON	NEFF
0	8	rs62513865	101592213	C	T	0.930	0.927	0.963	0.011998	0.0171	0.4847	53386	77258	58749.13
1	8	rs79643588	106973048	G	A	0.907	0.906	0.997	-0.008597	0.0148	0.5605	53386	77258	58749.13
2	8	rs17396518	108690829	T	G	0.565	0.566	0.985	-0.002102	0.0087	0.8145	53386	77258	58749.13
3	8	rs983166	108681675	A	C	0.564	0.563	0.988	0.004898	0.0087	0.5704	53386	77258	58749.13
4	8	rs28842593	103044620	T	C	0.840	0.840	0.948	-0.003898	0.0121	0.7488	53386	77258	58749.13

	CHROM	POS	ID	PVAL
356228	6	28390230	rs16894095	1.098000e-17
377010	6	32657436	rs3129716	4.522000e-23
1019572	12	2299998	rs2238046	6.511000e-20
1019566	12	2350401	rs882195	1.335000e-19
1019557	12	2312291	rs2239026	4.502000e-19
1019532	12	2297353	rs11062145	1.755000e-19
349956	6	29611781	rs3117318	3.075000e-18
349957	6	28398291	rs740622	4.310000e-17
1019510	12	2312620	rs12422554	1.890000e-19
1019499	12	2368674	rs10774035	3.182000e-19

Overview: Initial GWAS and MR Analyses for Schizophrenia¶

Over-Correction Explanation¶

Nature of the Inflation¶

Code ran in terminal:¶

Analysis¶

Possible explanations for mismatch with original paper¶

For our paper¶

Terminal code for tissue enrichment¶

Methods: Mendelian Randomization Analysis¶

Data Sources and Preparation¶

Statistical Analysis¶

Software and Implementation¶

Standardization Methods Used in Mendelian Randomization Analysis¶

Direct Beta Values¶

Z-score Standardization¶

Odds Ratio Conversion¶

	File	Number of Columns	Columns	Error
0	Household_Income_UKBiobank.txt.gz	8	Chr, SNP, BPos, Non_effect_Allele, Effect_Alle...	None
1	AgeFirstBirth_Female.txt.gz	8	SNPID, CHR, POS, A1, A2...	None
2	AgeFirstBirth_GCST90000048_buildGRCh37.tsv.gz	8	variant_id, effect_allele, other_allele, beta,...	None
3	CUD_EUR_casecontrol_public_11.14.2020.gz	11	CHR, SNP, BP, A1, A2...	None
4	pts_eur_freeze2_overall.results.gz	19	CHR, SNP, BP, A1, A2...	None
5	ADHD2022_iPSYCH_deCODE_PGC.meta.gz	14	CHR, SNP, BP, A1, A2...	None
6	Computer.all.gz	12	uniqid, SNP, CHR, BP, GENPOS...	None
7	chronotype_raw_BOLT.output_HRC.only_plus.metri...	11	SNP, CHR, BP, ALLELE1, ALLELE0...	None