import os
import argparse
import subprocess
import time
import threading
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import shutil

# Pipeline scripts in execution order. process_folder() runs them one after
# another for every folder and resolves each file name relative to the
# directory that contains this file.
PIPELINE_STEPS = [
    "1_Run_zol_single_folder.py",
    "3_Zol_to_json_single.py",
    "4_GbkInfo_json_single.py",
    "AstralV2_single.py",
    "9_gbk_vis_single.py",
    "heap_calculation_single.py",
    "user_vis_single.py"
]

# Short display names, index-aligned with PIPELINE_STEPS (STEP_NAMES[i] labels
# PIPELINE_STEPS[i]). They appear in the terminal status view, in per-step log
# file names, and in the error log and summary file.
STEP_NAMES = [
    "Zol_Run",
    "Zol_to_JSON",
    "GBK_Info",
    "ASTRAL",
    "GBK_Vis",
    "Heaps_Law",
    "User_Vis"
]

# Command-line argument templates, keyed by script file name. format_args()
# fills in the {placeholders}; {step} receives the step's display name from
# STEP_NAMES, not the script file name.
# NOTE(review): for the first script the "-log" value expands to
# "<log_dir>/<folder>_<step>.log", which is the same file run_step() redirects
# that script's stdout/stderr into -- confirm the script does not truncate it.
STEP_ARGS = {
    "1_Run_zol_single_folder.py": ["-i", "{input_dir}/{folder}", "-o", "{output_dir}/{folder}", "-log", "{log_dir}/{folder}_{step}.log", "-c", "{cores}"],
    "3_Zol_to_json_single.py": ["-dir", "{output_dir}/{folder}", "-c", "{cores}"],
    "4_GbkInfo_json_single.py": ["-dir", "{output_dir}/{folder}", "-c", "{cores}"],
    "AstralV2_single.py": ["-i", "{output_dir}/{folder}", "-c", "{cores}", "-al", "{astral_path}"],
    "9_gbk_vis_single.py": ["-i", "{output_dir}/{folder}", "-c", "{cores}"],
    "heap_calculation_single.py": ["-dir", "{output_dir}/{folder}", "-c", "{cores}"],
    "user_vis_single.py": ["-i", "{output_dir}/{folder}", "-c", "{cores}"]
}

# Shared progress state. Worker threads update it while holding status_lock;
# display_status() reads it to draw the terminal view.
folder_status = {}  # folder name -> step index; -1 = not started, -2 = failed, len(PIPELINE_STEPS) = finished
step_counts = {}    # step index -> number of folders currently at that step
steps_completed = {} # step index -> number of folders recorded as done with that step
completed_count = 0  # Folders that went through the whole pipeline
failed_count = 0     # Folders stopped by a failed critical step
status_lock = threading.Lock()  # Guards the progress state above
error_log_lock = threading.Lock()  # Serializes appends to the shared error.txt

# Serializes terminal redraws so two threads never interleave their output
display_lock = threading.Lock()

def format_args(arg_template, folder, input_dir, output_dir, log_dir, cores, astral_path, step):
    """Expand the ``{placeholder}`` fields of a command-line argument template.

    Every string in ``arg_template`` is passed through ``str.format`` with the
    keyword names ``folder``, ``input_dir``, ``output_dir``, ``log_dir``,
    ``cores``, ``astral_path`` and ``step``. Strings without placeholders are
    returned unchanged.

    Returns:
        list: A new list of formatted strings, in template order.
    """
    substitutions = {
        "folder": folder,
        "input_dir": input_dir,
        "output_dir": output_dir,
        "log_dir": log_dir,
        "cores": cores,
        "astral_path": astral_path,
        "step": step,
    }
    return [piece.format(**substitutions) for piece in arg_template]

def display_status(total_folders):
    """Redraw the progress view in the terminal.

    Clears the screen, then prints one line per pipeline step (folders
    currently at the step and folders done with it), the completed and failed
    folder totals, and the overall percentage. The whole redraw happens while
    holding ``display_lock``.

    Args:
        total_folders: Denominator used for every "x/total" figure.
    """
    with display_lock:
        # Wipe the previous frame ('cls' on Windows, 'clear' elsewhere)
        clear_command = 'cls' if os.name == 'nt' else 'clear'
        os.system(clear_command)

        print("=== PanBGC Pipeline Status ===")
        print(f"Total Folders: {total_folders}")

        # One line per step; clamp at zero so a counter glitch never shows a negative
        for position, label in enumerate(STEP_NAMES):
            in_flight = max(0, step_counts.get(position, 0))
            done = steps_completed.get(position, 0)
            print(f"{label}: {in_flight}/{total_folders} processing, {done}/{total_folders} completed")

        print(f"Complete Pipeline: {completed_count}/{total_folders} folders")
        if failed_count > 0:
            print(f"Failed: {failed_count}/{total_folders} folders")

        # A folder counts towards progress whether it finished or failed
        settled = completed_count + failed_count
        if total_folders > 0:
            settled_pct = (settled / total_folders) * 100
        else:
            settled_pct = 0
        print(f"Overall Progress: {settled}/{total_folders} ({settled_pct:.1f}%)")
        print("=" * 50)

def get_last_lines(file_path, n=10):
    """Return the last ``n`` lines of a text file.

    The file is streamed through a bounded deque, so only ``n`` lines are held
    in memory regardless of the file size. Bytes that cannot be decoded are
    replaced instead of aborting the read, because the logs read here contain
    arbitrary child-process output.

    Args:
        file_path: Path of the file to read.
        n: Maximum number of trailing lines to return. Zero or a negative
            value yields an empty list.

    Returns:
        list: Up to ``n`` lines, each keeping its line ending. If the file
        cannot be read, a single-element list holding an error message is
        returned instead of raising.
    """
    from collections import deque

    try:
        # lines[-0:] would be the whole file, so handle "no lines" explicitly
        if n <= 0:
            return []
        with open(file_path, 'r', errors='replace') as file:
            return list(deque(file, maxlen=n))
    except Exception as e:
        # Best effort: this helper feeds error reports and must not raise itself
        return [f"Error reading file: {e}"]

def write_error_to_log(output_dir, folder, error_info):
    """Append one failure record to ``error.txt`` inside ``output_dir``.

    A title header is written first when the file does not exist yet. The
    existence check and the append both happen while holding
    ``error_log_lock``, so records from different threads do not interleave.

    Args:
        output_dir: Directory that holds ``error.txt``.
        folder: Name of the folder the failure belongs to.
        error_info: Mapping with the keys ``'step'``, ``'command'``,
            ``'log_file'`` and ``'error_details'``.
    """
    error_file = os.path.join(output_dir, "error.txt")

    def _record(with_header):
        # Lazily yields the pieces in write order
        if with_header:
            yield "PanBGC Pipeline Error Log\n"
            yield "========================\n\n"
        yield f"Folder: {folder}\n"
        yield f"Failed Step: {error_info['step']}\n"
        yield f"Command: {error_info['command']}\n"
        yield f"Log File: {error_info['log_file']}\n"
        yield "Error Details:\n"
        yield "--------------\n"
        yield error_info['error_details']
        yield "\n\n" + "="*50 + "\n\n"

    with error_log_lock:
        # Decide about the header before the append-mode open creates the file
        is_new_file = not os.path.exists(error_file)
        with open(error_file, 'a') as f:
            for piece in _record(is_new_file):
                f.write(piece)

def run_step(script_path, folder, step_index, input_dir, output_dir, log_dir, cores, astral_path):
    """Run one pipeline script for one folder and record the outcome.

    The script is started with the interpreter that runs this pipeline, with
    the arguments from ``STEP_ARGS``. Its stdout and stderr go to
    ``<log_dir>/<folder>_<step name>.log``. When the script exits non-zero or
    cannot be started, a record is appended to ``error.txt`` in ``output_dir``.

    Progress bookkeeping (all under ``status_lock``):

    * On entry, a tracked folder that is still parked on an earlier step
      (which happens after a non-critical step failed) is moved to this step.
    * On success, the step is counted as completed and the folder advances to
      the next step.
    * On failure, the folder stays on this step. Releasing its slot is left to
      the caller, which does that exactly once.

    Args:
        script_path: Path of the script to execute.
        folder: Name of the folder being processed.
        step_index: Index of this step in ``PIPELINE_STEPS`` / ``STEP_NAMES``.
        input_dir: Pipeline input directory.
        output_dir: Pipeline output directory.
        log_dir: Directory for the per-step log files.
        cores: Value substituted for ``{cores}`` in the argument template.
        astral_path: Value substituted for ``{astral_path}``.

    Returns:
        bool: True when the script ran and exited with status 0.
    """
    step_script = os.path.basename(script_path)
    step_name = STEP_NAMES[step_index]

    with status_lock:
        # Normally the folder is already marked as being at this step. After a
        # non-critical failure it is still on the failed step, so move it here
        # to keep the per-step counters matching what is really running.
        previous_step = folder_status.get(folder)
        if previous_step is not None and previous_step != step_index:
            if 0 <= previous_step < len(PIPELINE_STEPS):
                step_counts[previous_step] = max(0, step_counts.get(previous_step, 0) - 1)
            folder_status[folder] = step_index
            step_counts[step_index] = step_counts.get(step_index, 0) + 1
        display_status(len(folder_status))

    log_file = os.path.join(log_dir, f"{folder}_{step_name}.log")

    # Prepare arguments
    args = format_args(STEP_ARGS[step_script], folder, input_dir, output_dir, log_dir, cores, astral_path, step_name)

    # Use the running interpreter so the child sees the same environment and
    # packages; a bare "python" may be missing from PATH or be another version.
    cmd = [sys.executable or "python", script_path] + args
    cmd_str = " ".join(cmd)

    error_details = None
    try:
        os.makedirs(log_dir, exist_ok=True)

        # Start a fresh log with the command line
        with open(log_file, 'w') as log:
            log.write(f"Running command: {cmd_str}\n\n")

        # Reopen in append mode so the child's output lands after the header
        with open(log_file, 'a') as log:
            process = subprocess.run(cmd, stdout=log, stderr=log)
        success = process.returncode == 0

        if not success:
            error_details = ''.join(get_last_lines(log_file))
    except Exception as e:
        # The script could not be started or the log could not be written
        success = False
        error_details = f"Exception occurred: {str(e)}"
        try:
            with open(log_file, 'a') as log:
                log.write(f"\nUnexpected ERROR: {e}\n")
        except OSError:
            # The log itself is unwritable; the failure is still reported
            # through error.txt below.
            pass

    if not success:
        # Write to the shared error log immediately
        write_error_to_log(output_dir, folder, {
            'step': step_name,
            'command': cmd_str,
            'log_file': log_file,
            'error_details': error_details
        })

    with status_lock:
        if success:
            # Only successful runs count as completed
            steps_completed[step_index] = steps_completed.get(step_index, 0) + 1

            # Advance the folder if it is still registered at this step
            if folder_status.get(folder) == step_index:
                step_counts[step_index] = max(0, step_counts.get(step_index, 0) - 1)
                next_step = step_index + 1
                folder_status[folder] = next_step
                if next_step < len(PIPELINE_STEPS):
                    step_counts[next_step] = step_counts.get(next_step, 0) + 1
        display_status(len(folder_status))

    return success

def process_folder(folder, input_dir, output_dir, log_dir, cores, astral_path):
    """Run every pipeline step for one folder, stopping at a critical failure.

    The scripts named in ``PIPELINE_STEPS`` are looked up next to this file
    and executed in order through ``run_step``. Steps 0, 1, 2, 4 and 5 are
    critical: if one of them fails, the folder is marked failed and the
    remaining steps are skipped. A failure of any other step is ignored and
    processing continues.

    Returns:
        tuple: ``(folder, "FAILED", index_of_failed_step)`` after a critical
        failure, otherwise ``(folder, "SUCCESS", len(PIPELINE_STEPS))``.
    """
    global completed_count, failed_count

    critical_steps = (0, 1, 2, 4, 5)
    step_total = len(PIPELINE_STEPS)
    here = os.path.dirname(os.path.realpath(__file__))

    def _release_current_step():
        # Caller holds status_lock. Frees the folder's slot in the counter of
        # the step it is registered at, if that is a real step index.
        if folder not in folder_status:
            return
        position = folder_status[folder]
        if 0 <= position < step_total:
            step_counts[position] = max(0, step_counts.get(position, 0) - 1)

    # Register the folder at the first step before any work starts
    with status_lock:
        folder_status[folder] = 0
        step_counts[0] = step_counts.get(0, 0) + 1
        display_status(len(folder_status))

    for index, script_name in enumerate(PIPELINE_STEPS):
        # run_step moves the folder on to the next step when it succeeds
        ok = run_step(os.path.join(here, script_name), folder, index,
                      input_dir, output_dir, log_dir, cores, astral_path)
        if ok or index not in critical_steps:
            continue

        # A critical step failed: mark the folder as failed (-2) and stop
        with status_lock:
            _release_current_step()
            folder_status[folder] = -2
            failed_count += 1
            display_status(len(folder_status))
        return folder, "FAILED", index

    # Every step ran: park the folder one past the last step index
    with status_lock:
        _release_current_step()
        folder_status[folder] = step_total
        completed_count += 1
        display_status(len(folder_status))

    return folder, "SUCCESS", step_total
def initialize_status_counts(total_folders):
    """Reset the per-step counters to zero for every pipeline step.

    Both ``step_counts`` and ``steps_completed`` get one zeroed entry per index
    of ``PIPELINE_STEPS``. ``total_folders`` is accepted but not used.
    """
    global step_counts, steps_completed
    zeroed = dict.fromkeys(range(len(PIPELINE_STEPS)), 0)
    step_counts.update(zeroed)
    steps_completed.update(zeroed)

def check_for_gbk_files(directory):
    """Return True if ``directory`` directly holds a regular file ending in ``.gbk``.

    Only the directory's own entries are inspected; sub-directories are not
    searched, and a directory whose name ends in ``.gbk`` does not count.
    """
    for entry in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, entry)) and entry.endswith('.gbk'):
            return True
    return False

def prepare_single_folder(input_dir, temp_folder_name="single_family"):
    """
    Gather the .gbk files lying directly in the input directory into one sub-folder.

    The files are copied (with metadata), not moved, so the originals stay in
    place. The sub-folder is created if needed and reused if it already exists.

    Args:
        input_dir (str): Input directory path
        temp_folder_name (str): Name of the sub-folder to create inside input_dir

    Returns:
        str: The name of the sub-folder, or None if input_dir holds no .gbk files
    """
    # Collect regular files with the .gbk suffix; sub-directories are ignored
    candidates = []
    for name in os.listdir(input_dir):
        if name.endswith('.gbk') and os.path.isfile(os.path.join(input_dir, name)):
            candidates.append(name)

    if len(candidates) == 0:
        return None

    target_dir = os.path.join(input_dir, temp_folder_name)
    os.makedirs(target_dir, exist_ok=True)

    for name in candidates:
        shutil.copy2(os.path.join(input_dir, name), os.path.join(target_dir, name))

    print(f"Created {temp_folder_name} folder with {len(candidates)} .gbk files")
    return temp_folder_name

def main():
    """Command-line entry point: run the pipeline over every input folder.

    Workflow:

    1. Parse and validate the arguments.
    2. Choose the folders to process. If the input directory directly holds
       ``.gbk`` files, they are copied into one sub-folder and only that
       folder is processed; otherwise every non-hidden sub-directory is.
    3. Run ``process_folder`` for each folder on a thread pool with
       ``--cores`` workers; every pipeline script is given one core.
    4. Print the final status and write ``pipeline_summary.txt`` to the output
       directory. Failures are collected in ``error.txt`` in the same place.

    A folder whose worker raised an unexpected exception is reported with the
    status ``ERROR`` and is included in the failed count.
    """
    global completed_count, failed_count

    parser = argparse.ArgumentParser(description="Run PanBGC pipeline with parallel folder processing.")
    parser.add_argument("-i", "--input", required=True, help="Input directory containing folders or .gbk files.")
    parser.add_argument("-o", "--output", required=True, help="Output directory for results.")
    parser.add_argument("-log", "--logdir", required=True, help="Directory for log files.")
    parser.add_argument("-c", "--cores", required=True, type=int, help="Number of folders to process in parallel.")
    parser.add_argument("-al", "--align", required=True, help="Path to ASTRAL alignment tool.")
    args = parser.parse_args()

    # Fail with a usage message instead of a traceback further down:
    # ThreadPoolExecutor rejects max_workers < 1 and os.listdir needs a directory.
    if args.cores < 1:
        parser.error("-c/--cores must be at least 1")
    if not os.path.isdir(args.input):
        parser.error(f"input directory does not exist: {args.input}")

    # Ensure directories exist
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.logdir, exist_ok=True)

    # Get list of folders to process
    if check_for_gbk_files(args.input):
        print("Detected .gbk files directly in the input directory")
        # Create a single folder with all .gbk files
        single_folder = prepare_single_folder(args.input)
        if single_folder:
            folders = [single_folder]
            print(f"Will process a single folder: {single_folder}")
        else:
            print("Error: Failed to create a folder for .gbk files")
            return
    else:
        # Get all non-hidden subdirectories
        folders = [d for d in os.listdir(args.input)
                   if os.path.isdir(os.path.join(args.input, d)) and not d.startswith('.')]

    if not folders:
        print(f"No folders or .gbk files found in {args.input}")
        return

    # Initialize status counters
    initialize_status_counts(len(folders))

    # Initialize folder status
    for folder in folders:
        folder_status[folder] = -1  # -1 means not started

    # Start from a clean error log so it only describes this run
    error_file = os.path.join(args.output, "error.txt")
    if os.path.exists(error_file):
        os.remove(error_file)

    start_time = time.time()

    # Process folders in parallel
    results = []

    with ThreadPoolExecutor(max_workers=args.cores) as executor:
        # Submit all folders for processing
        future_to_folder = {
            executor.submit(
                process_folder,
                folder,
                args.input,
                args.output,
                args.logdir,
                1,  # Each process gets 1 core
                args.align
            ): folder for folder in folders
        }

        # Process results as they complete
        for future in concurrent.futures.as_completed(future_to_folder):
            folder = future_to_folder[future]
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error processing {folder}: {e}")

                # Record the error immediately
                error_info = {
                    'step': 'Unknown',
                    'command': 'N/A',
                    'log_file': 'N/A',
                    'error_details': f"Unhandled exception: {str(e)}"
                }
                write_error_to_log(args.output, folder, error_info)

                # The worker died before it could settle the folder itself, so
                # release its step slot and count it as failed here. Skip this
                # when the worker had already recorded a final state.
                with status_lock:
                    current_step = folder_status.get(folder, -1)
                    if current_step not in (-2, len(PIPELINE_STEPS)):
                        if 0 <= current_step < len(PIPELINE_STEPS):
                            step_counts[current_step] = max(0, step_counts.get(current_step, 0) - 1)
                        folder_status[folder] = -2
                        failed_count += 1

                results.append((folder, "ERROR", -1))

    # Calculate elapsed time
    elapsed_time = time.time() - start_time

    # Final status display
    display_status(len(folders))

    # Print summary; only point at the error log when one was written
    print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
    if os.path.exists(error_file):
        print(f"Error details logged to: {error_file}")

    # Write summary to file
    summary_file = os.path.join(args.output, "pipeline_summary.txt")
    with open(summary_file, 'w') as f:
        f.write("PanBGC Pipeline Summary\n")
        f.write("======================\n\n")
        f.write(f"Total folders: {len(folders)}\n")
        f.write(f"Completed successfully: {completed_count}\n")
        f.write(f"Failed: {failed_count}\n")
        f.write(f"Processing time: {elapsed_time:.2f} seconds\n\n")

        f.write("Folder\tStatus\tFailed Step\n")
        for folder, status, step_idx in sorted(results):
            # Only FAILED rows carry a meaningful step index
            failed_step = STEP_NAMES[step_idx] if status == "FAILED" and step_idx >= 0 else "N/A"
            f.write(f"{folder}\t{status}\t{failed_step}\n")

    print(f"Summary written to {summary_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()