# This script converts MAP-format sequences to plain natural sequences in FASTA or CSV format.
# It takes the first column that contains the {...} tag pattern and removes every such tag.
# 
# Startup banner: program name, authorship and citation notice.
# The five lines are joined and emitted with a single print call.
_BANNER_LINES = (
    '######################################################################################################',
    '############## You are using Modification and Annotation in Proteins (MAP) Script ####################',
    '##################### MAP Program, developed by Prof G. P. S. Raghava group. #########################',
    '############ Please cite: MAP; available at https://webs.iiitd.edu.in/raghava/maprepo/  ##############',
    '######################################################################################################',
)
print("\n".join(_BANNER_LINES))
 

import pandas as pd
import re
import argparse


# ----------- Argument Parser -----------
def _build_parser():
    """Create the command-line parser for the tag-cleaning script.

    Options are registered in the order they appear in ``--help``:
    input file, output base name, output format, then the three values
    that are interpolated into FASTA headers.
    """
    cli = argparse.ArgumentParser(
        description="Clean tagged sequences and export to CSV or FASTA."
    )
    cli.add_argument(
        "-i", "--input",
        type=str,
        required=True,
        help="Input CSV file",
    )
    cli.add_argument(
        "-o", "--output",
        type=str,
        default="Cleaned_Sequences",
        help="Base name for output files",
    )
    cli.add_argument(
        "-f", "--format",
        type=str,
        choices=['f', 'c'],
        required=True,
        help="Output format: 'f' for FASTA, 'c' for CSV",
    )
    cli.add_argument(
        "-org", "--org",
        type=str,
        default="undefine",
        help="Organism name for FASTA header",
    )
    cli.add_argument(
        "-func", "--func",
        type=str,
        default="unknown",
        help="Function for FASTA header",
    )
    cli.add_argument(
        "--prefix",
        type=str,
        default="Sample",
        help="Header prefix for FASTA entries",
    )
    return cli


parser = _build_parser()

# Parsed once at import time; the rest of the script reads its options from `args`.
args = parser.parse_args()

# ----------- Tag pattern -----------

# A tag is the shortest run of characters enclosed in one pair of curly braces.
TAG_REGEX = r'\{.*?\}'
TAG_PATTERN = re.compile(TAG_REGEX)


# ----------- Load CSV and Detect Header -----------

def headerless_column_names(n_columns):
    """Return column names for a CSV that has no header row.

    A single-column file keeps the historical name ``MAP_seq``.  Wider files
    get positional names ``Column_1 .. Column_n``; assigning the one-element
    list to them would raise a length-mismatch ``ValueError``.
    """
    if n_columns == 1:
        return ['MAP_seq']
    return [f"Column_{position}" for position in range(1, n_columns + 1)]


def load_table(path):
    """Read the input CSV at *path* into a DataFrame.

    The first row is inspected on its own: if any of its cells contains a
    ``{...}`` tag, the row is treated as data and the file is read without a
    header; otherwise the first row is used as the header.
    """
    first_row = pd.read_csv(path, header=None, nrows=1)
    first_row_is_data = first_row.iloc[0].astype(str).str.contains(TAG_REGEX).any()

    if not first_row_is_data:
        return pd.read_csv(path)

    table = pd.read_csv(path, header=None)
    table.columns = headerless_column_names(table.shape[1])
    return table


# ----------- Detect First Column with {tags} -----------

def find_tag_column(table):
    """Return the label of the first column holding a ``{...}`` tag, else ``None``."""
    for label in table.columns:
        if table[label].astype(str).str.contains(TAG_REGEX).any():
            return label
    return None


# ----------- Clean Tagged Sequences -----------

def strip_tags(value):
    """Remove every ``{...}`` tag from *value*.

    Non-string cells (for example missing values) cannot be cleaned and are
    replaced by the literal marker ``"ERROR"``.
    """
    if not isinstance(value, str):
        return "ERROR"
    return TAG_PATTERN.sub('', value)


# ----------- Build FASTA text -----------

def build_fasta(sequences, prefix, org, func):
    """Render *sequences* as FASTA text.

    Each record has the header ``>{prefix}_{n} {org:...} {func:...}`` with
    ``n`` counting from 1, followed by the sequence on its own line.  Every
    record, including the last, ends with a newline so that concatenating
    FASTA files keeps records separated.
    """
    return "".join(
        f">{prefix}_{number} {{org:{org}}} {{func:{func}}}\n{sequence}\n"
        for number, sequence in enumerate(sequences, start=1)
    )


def main(options):
    """Clean the tagged column of the input CSV and export it.

    *options* is the parsed command-line namespace; the attributes read are
    ``input``, ``output``, ``format``, ``prefix``, ``org`` and ``func``.

    Raises:
        ValueError: if no column of the input contains a ``{...}`` tag.
    """
    table = load_table(options.input)

    tag_column = find_tag_column(table)
    # Compare against None explicitly: a valid column label may be falsy.
    if tag_column is None:
        raise ValueError("❌ No column with {tags} found in input CSV.")

    table['Clean Sequence'] = table[tag_column].apply(strip_tags)

    # ----------- Export Cleaned CSV -----------
    if options.format == "c":
        csv_path = f"{options.output}.csv"
        table.to_csv(csv_path, index=False)
        print(f"✅ Cleaned sequences saved to: {csv_path}")

    # ----------- Export Cleaned FASTA -----------
    elif options.format == "f":
        fasta_path = f"{options.output}.fasta"
        fasta_text = build_fasta(
            table['Clean Sequence'], options.prefix, options.org, options.func
        )
        # Explicit encoding keeps the output independent of the platform default.
        with open(fasta_path, "w", encoding="utf-8") as handle:
            handle.write(fasta_text)
        print(f"✅ FASTA file saved as: {fasta_path}")


if __name__ == "__main__":
    # `args` is the namespace produced by the module-level argument parser.
    main(args)

