import argparse
import json
import os
import re
import sys

import pandas as pd

def process_single_directory(dir_path):
    """
    Process a single directory to convert Excel results to JSON.
    
    Args:
        dir_path (str): Path to the directory to process
    
    Returns:
        bool: True if successful, False otherwise
    """
    # Check if the directory exists and is a directory
    if not os.path.exists(dir_path) or not os.path.isdir(dir_path):
        print(f"{dir_path} does not exist or is not a directory.")
        return False
    
    # Define paths
    file_path = os.path.join(dir_path, "Final_Results", "Consolidated_Report.xlsx")
    input_genbank_dir = os.path.join(dir_path, "Local_Modified_GenBanks")
    
    # Count the number of files in the inputGenBank folder
    if os.path.exists(input_genbank_dir):
        total_bgcs = len([f for f in os.listdir(input_genbank_dir) 
                         if os.path.isfile(os.path.join(input_genbank_dir, f))])
        print(f"Found {total_bgcs} files in {input_genbank_dir}")
    else:
        print(f"Local_Modified_GenBanks directory not found: {input_genbank_dir}")
        total_bgcs = 0  # If the directory doesn't exist, set total_BGCs to 0
    
    # Check if the Excel file exists
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.")
        return False
    
    try:
        # Load the Excel file
        print(f"Processing file: {file_path}")
        excel_data = pd.ExcelFile(file_path)

        # Load the 'ZoL Results' sheet into a DataFrame
        zol_results_df = pd.read_excel(file_path, sheet_name='ZoL Results')

        # Function to count the number of entries in "CDS locus tags" for each row
        def count_cds_locus_tags(row):
            if pd.isna(row):
                return 0
            return len(str(row).split(';'))

        # Check if 'CDS Locus Tags' column exists, if not print a message
        if 'CDS Locus Tags' in zol_results_df.columns:
            # Add a new column "BGC_count" with the count of entries in "CDS locus tags"
            zol_results_df['BGC_count'] = zol_results_df['CDS Locus Tags'].apply(count_cds_locus_tags)
        else:
            print("'CDS Locus Tags' column not found in the sheet.")
            zol_results_df['BGC_count'] = 0  # Optionally add a default value

        # Function to sanitize column names (replace spaces with underscores, remove parentheses, etc.)
        def sanitize_column_name(col_name):
            return re.sub(r"[()\'-?]", "", col_name).replace(" ", "_")

        # Apply the sanitization to the DataFrame's column names
        zol_results_df.columns = [sanitize_column_name(col) for col in zol_results_df.columns]

        # Function to extract the numeric part from "Ortholog_Group_OG_ID"
        def extract_og_number(og_id):
            match = re.search(r'OG_(\d+)', str(og_id))
            if match:
                return int(match.group(1))
            return og_id

        # Apply the function to modify "Ortholog_Group_OG_ID" values
        if 'Ortholog_Group_OG_ID' in zol_results_df.columns:
            zol_results_df['Ortholog_Group_OG_ID'] = zol_results_df['Ortholog_Group_OG_ID'].apply(extract_og_number)
        else:
            print("'Ortholog_Group_OG_ID' column not found in the sheet.")

        # Function to clean up "OG_Consensus_Direction" to keep only "+" or "-"
        def clean_og_consensus_direction(direction):
            direction = str(direction).strip("\"")  # Remove surrounding quotes
            if direction in ['+', '-']:
                return direction
            return direction  # In case there are other characters, return as is (optional)

        # Apply the function to "OG_Consensus_Direction"
        if 'OG_Consensus_Direction' in zol_results_df.columns:
            zol_results_df['OG_Consensus_Direction'] = zol_results_df['OG_Consensus_Direction'].apply(clean_og_consensus_direction)
        else:
            print("'OG_Consensus_Direction' column not found in the sheet.")

        # Replace NaN/None values with "-"
        zol_results_df = zol_results_df.fillna("-")

        # Convert the DataFrame to a list of dictionaries
        zol_results_list = zol_results_df.to_dict(orient='records')

        # Add the total_BGCs value to each dictionary
        for record in zol_results_list:
            record['total_BGCs'] = total_bgcs

        # Make sure the output directory exists
        os.makedirs(os.path.dirname(os.path.join(dir_path, "Final_Results")), exist_ok=True)
        
        # Save the list of dictionaries to a JSON file
        json_file_path = os.path.join(dir_path, "Final_Results", "Report.json")
        with open(json_file_path, 'w') as json_file:
            json.dump(zol_results_list, json_file, indent=4)

        print(f'JSON file saved to {json_file_path}')
        return True
        
    except Exception as e:
        print(f"An error occurred while processing {file_path}: {e}")
        return False

def parse_arguments():
    """
    Parse the command-line arguments of the script.

    Returns:
        argparse.Namespace: Namespace with ``directory`` (str, required) and
        ``cores`` (int, defaults to 1).
    """
    parser = argparse.ArgumentParser(description="Process Excel file in a single directory and save the results as JSON.")
    parser.add_argument('-dir', '--directory', required=True, help='Directory to process.')
    # The value is not used when processing a single directory, so it is
    # accepted for compatibility with existing invocations but not demanded.
    parser.add_argument('-c', '--cores', required=False, type=int, default=1,
                        help='Number of cores (not used for single directory).')

    return parser.parse_args()

def main():
    """
    Run the conversion for the directory given on the command line.

    Returns:
        int: 0 if the directory was processed successfully, 1 otherwise.
    """
    args = parse_arguments()
    directory_path = args.directory

    print(f"Processing directory: {directory_path}")
    if process_single_directory(directory_path):
        print(f"Successfully processed directory: {directory_path}")
        return 0

    print(f"Failed to process directory: {directory_path}")
    return 1


if __name__ == "__main__":
    # Propagate failure to the caller through the process exit code.
    sys.exit(main())
