#!/usr/bin/env python
# See the LICENSE file included with this software for license information.

import os, sys, string, getopt, random,subprocess, time,operator, math, datetime,numpy #pysam
from collections import defaultdict
import csv
import shutil
from tempfile import TemporaryDirectory
import re
import logging
from logger import logger
import multiprocessing
import argparse
import signal
import inspect
from multiprocessing import *
from Bio import SeqIO
from glob import glob

import extend as ext

__version__ = "1.7.4"
# Whether to midpoint-reroot the final tree; requires dendropy (see below).
reroot_tree = True #use --midpoint-reroot

# dendropy is an optional dependency: without it the output tree is left
# unrooted rather than failing at import time.
try:
    import dendropy
except ImportError:
    reroot_tree = False

# Maps the user-facing aligner name (-n/--alignment-program) to the numeric
# index expected by the parsnp core binary's .ini template ($ALIGNER).
ALIGNER_TO_IDX = {
        "mafft": "1",
        "muscle": "2",
        "fsa": "3",
        "prank": "4"
}

VERBOSE = 0  # legacy verbosity flag; runtime verbosity now uses logging levels
VERSION = __version__
PHI_WINDOWSIZE = 1000  # PhiPack window size: sequences shorter than this are skipped
TOTSEQS=0  # total number of input sequences; recomputed in __main__
PARSNP_DIR = sys.path[0]  # may be overridden by the PARSNPDIR env var below




########################################### Environment ############################################
# Allow the install location to be overridden via the PARSNPDIR env var;
# otherwise fall back to the directory this script was launched from.
try:
    os.environ["PARSNPDIR"]
    PARSNP_DIR = os.environ["PARSNPDIR"]
except KeyError:
    PARSNP_DIR = sys.path[0]
# Set to True by the SIGINT handler so run_command() can tell a user abort
# apart from a genuine subprocess failure.
SIGINT = False

# Prepend the Parsnp directory to PYTHONPATH so child processes can import
# the bundled helper modules.
try:
    os.environ["PYTHONPATH"] = PARSNP_DIR + os.pathsep + os.environ["PYTHONPATH"]
except KeyError:
    os.environ["PYTHONPATH"] = PARSNP_DIR + os.pathsep

# Detect whether we are running as a frozen (PyInstaller-style) binary; if so,
# the bundled shared libraries in ../lib must be made visible to the dynamic
# linker on both Linux (LD_LIBRARY_PATH) and macOS (DYLD_FALLBACK_LIBRARY_PATH).
frozenbinary = True
application_path = ""
if getattr(sys, 'frozen', False):
    application_path = os.path.dirname(sys.executable)
elif __file__:
    application_path = os.path.dirname(__file__)
    frozenbinary = False

if frozenbinary:
   utilPath = PARSNP_DIR
   libPath = os.path.abspath(os.path.join(utilPath, "..", "lib"))
   if os.path.exists(libPath):
      oldLDPath = ""
      needToAdd = True
      # Only prepend libPath if it is not already on the linker search path.
      if "LD_LIBRARY_PATH" in os.environ:
          oldLDPath = os.environ["LD_LIBRARY_PATH"]
          if libPath in oldLDPath:
              needToAdd = False
      elif "DYLD_FALLBACK_LIBRARY_PATH" in os.environ:
         oldLDPath = os.environ["DYLD_FALLBACK_LIBRARY_PATH"]
         if libPath in oldLDPath:
            needToAdd = False
      if needToAdd:
         os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = libPath + os.pathsep + oldLDPath
         os.environ["LD_LIBRARY_PATH"] = libPath + os.pathsep + oldLDPath

# Add binaries to path
# os.environ["PATH"] += os.pathsep + os.path.join(PARSNP_DIR, "bin")

# Detect the host operating system once at import time by shelling out to
# `uname`; fall back to "linux" when detection fails.
OSTYPE = "linux"
p = subprocess.Popen("echo `uname`", shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(checkStdout, checkStderr) = p.communicate()
if checkStderr != b"":
    # Bug fix: the original referenced undefined WARNING_YELLOW/ENDC color
    # constants here, raising NameError on this (error) path.
    sys.stderr.write("Warning: Cannot determine OS, defaulting to %s\n" % (OSTYPE))
else:
    OSTYPE = checkStdout.decode('utf-8').strip()

# Map the detected OS to the binary flavor shipped with Parsnp.
binary_type = "osx" if OSTYPE == "Darwin" else "linux"

####################################################################################################


######################################## Utility Functions #########################################
def get_os():
    """Detect the host operating system.

    Returns:
        tuple: ``(OSTYPE, binary_type)`` where ``OSTYPE`` is the platform
        name (e.g. ``"Linux"`` or ``"Darwin"``) and ``binary_type`` is the
        Parsnp binary flavor, ``"osx"`` or ``"linux"``.
    """
    # platform.system() reports the same name as `uname` on POSIX systems
    # without spawning a shell (the original ran `echo \`uname\`` through sh).
    import platform
    OSTYPE = platform.system()
    if not OSTYPE:
        # Mirror the original fallback when detection yields nothing.
        OSTYPE = "Linux"
        logger.warning("Cannot determine OS, defaulting to %s" % (OSTYPE))
    binary_type = "osx" if OSTYPE == "Darwin" else "linux"
    return OSTYPE, binary_type


def handler(signum, frame):
    """SIGINT handler: record the interrupt and exit with the signal code.

    Args:
        signum: Signal number delivered by the OS.
        frame: Current stack frame at delivery time (unused).
    """
    global SIGINT
    # run_command() consults this flag to avoid reporting subprocess failures
    # that were actually caused by the user's Ctrl+C.
    SIGINT = True
    logger.critical('Caught request to terminate by user (CTRL+C), exiting now, bye')
    # 128 + signum is the shell convention for "terminated by signal"
    # (130 for SIGINT); the original exited with a bare, ambiguous 128.
    sys.exit(128 + signum)

signal.signal(signal.SIGINT, handler)


#TODO Merge run fns
def run_phipack(query, seqlen, workingdir):
    """Run PhiPack's Profile scan on *query* inside *workingdir*.

    Args:
        query: FASTA file name (relative to *workingdir*); output goes to
            ``<query>.out`` in the same directory.
        seqlen: Alignment length, passed to Profile via ``-n``.
        workingdir: Directory in which Profile is executed.
    """
    currdir = os.getcwd()
    os.chdir(workingdir)
    try:
        command = "Profile -o -v -n %d -w 100 -m 100 -f %s > %s.out" % (seqlen, query, query)
        run_command(command, 1)
    finally:
        # Always restore the caller's working directory, even if the command
        # wrapper raises (the original could strand the process in workingdir).
        os.chdir(currdir)

def run_fasttree(query, workingdir, recombination_sites):
    """Build a bootstrapped FastTree phylogeny from ``seq.fna`` in *workingdir*.

    Args:
        query: Query identifier (unused; kept for interface parity).
        workingdir: Directory containing ``seq.fna``; ``out.tree`` is written
            there.
        recombination_sites: Unused placeholder (kept for interface parity).
    """
    currdir = os.getcwd()
    os.chdir(workingdir)
    try:
        command = "fasttree -nt -quote -gamma -slow -boot 100 seq.fna > out.tree"
        run_command(command, 1)
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(currdir)


#TODO Merge wrappers
def parallelWrapper(params):
    """Multiprocessing wrapper around ``run_mummer``.

    Args:
        params: Dict with keys ``"jobID"``, ``"ref"``, ``"query"``,
            ``"prefix"``.

    Returns:
        Dict with ``"jobID"`` and ``"status"`` (1 = success, 0 = failure).
    """
    # Build the result up front so the except branches below can never hit an
    # unbound local: the original created `result` (and read `jobID`) inside
    # the try, so an early failure raised NameError instead of reporting.
    result = {"jobID": params.get("jobID"), "status": 0}
    try:
        run_mummer(params["ref"], params["query"], params["prefix"])
        result["status"] = 1
        return result
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt in thread %d, quitting\n" % (result["jobID"]))
        return result
    except Exception:
        logger.info("Other error in thread %d, quitting\n" % (result["jobID"]))
        return result


def parallelFtWrapper(params):
    """Multiprocessing wrapper around ``run_fasttree``.

    Args:
        params: Dict with keys ``"jobID"``, ``"query"``, ``"dir"``,
            ``"recombination"``.

    Returns:
        Dict with ``"jobID"`` and ``"status"`` (1 = success, 0 = failure).
    """
    # Initialize before the try so the except branches never see an unbound
    # local (the original only assigned `result`/`jobID` inside the try).
    result = {"jobID": params.get("jobID"), "status": 0}
    try:
        run_fasttree(params["query"], params["dir"], params["recombination"])
        result["status"] = 1
        return result
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt in thread %d, quitting\n" % (result["jobID"]))
        return result
    except Exception:
        logger.info("Other error in thread %d, quitting\n" % (result["jobID"]))
        return result


def parallelPhiWrapper(params):
    """Multiprocessing wrapper around ``run_phipack``.

    Args:
        params: Dict with keys ``"jobID"``, ``"query"``, ``"seqlen"``,
            ``"dir"``.

    Returns:
        Dict with ``"jobID"`` and ``"status"``: 1 = PhiPack ran,
        2 = sequence too short for the PhiPack window, 0 = failure.
    """
    # Initialize before the try so the except branches never see an unbound
    # local (the original only assigned `result`/`jobID` inside the try).
    result = {"jobID": params.get("jobID"), "status": 0}
    try:
        if params["seqlen"] >= 1000:
            run_phipack(params["query"], params["seqlen"], params["dir"])
            result["status"] = 1
        else:
            # Alignments shorter than the PhiPack window are skipped.
            result["status"] = 2
        return result
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt in thread %d, quitting\n" % (result["jobID"]))
        return result
    except Exception:
        logger.info("Other error in thread %d, quitting\n" % (result["jobID"]))
        return result


def run_command(command, ignorerc=0):
    """Run *command* through bash, logging its output; abort Parsnp on failure.

    Args:
        command: Shell command line to execute.
        ignorerc: When truthy, a non-zero return code is not treated as fatal.
    """
    global SIGINT
    logger.debug(command)
    p = subprocess.Popen(command, shell=True, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, executable="/bin/bash")
    fstdout, fstderr = p.communicate()
    rc = p.returncode

    fstdout = fstdout.decode("utf-8")
    fstderr = fstderr.decode("utf-8")
    # Failures of simple filesystem housekeeping commands are tolerated; only
    # real pipeline steps are fatal. A user Ctrl+C (SIGINT) is also excluded
    # so we don't blame the subprocess for an intentional abort.
    housekeeping = ("rm ", "ls ", "unlink ", "ln ", "mkdir ", "mv ")
    if rc != 0 and not SIGINT and not ignorerc and not any(tok in command for tok in housekeeping):
        # Typo fix in the user-facing message: "veryify" -> "verify".
        logger.critical("""The following command failed:
      >>$ {}
      Please verify input data and restart Parsnp.
      If the problem persists please contact the Parsnp development team.

      STDOUT:
      {}

      STDERR:
      {}""".format(command, fstdout, fstderr))

        sys.exit(rc)
    else:
        logger.debug(fstdout)
        logger.debug(fstderr)


def is_valid_file_path(parser, arg):
    """argparse ``type=`` helper: validate that *arg* names an existing file.

    The sentinel values ``"!"`` (autopick reference), ``None`` and ``""`` are
    passed through untouched.

    Args:
        parser: The argparse parser, used to report a fatal error.
        arg: Candidate file path.

    Returns:
        The unmodified *arg* when it exists or is a sentinel value.
    """
    if not os.path.exists(arg) and arg != "!" and arg is not None and arg != "":
        # parser.error() prints usage and exits with status 2; the original
        # merely logged and fell through, silently handing None to argparse.
        parser.error("The file %s does not exist!" % arg)
    else:
        return arg


def is_valid_dir(parser, arg):
    """argparse helper: validate that *arg* is an existing, non-empty directory.

    Args:
        parser: The argparse parser, used to report fatal errors.
        arg: Candidate directory path.

    Returns:
        The unmodified *arg* on success (the original returned None, which
        made it unusable as an argparse ``type=`` callable).
    """
    if not os.path.exists(arg):
        parser.error("The directory %s does not exist\n" % (arg))
    if len(glob("%s/*" % (arg))) == 0:
        # Typo fix in the user-facing message: "director" -> "directory".
        parser.error("The directory %s is empty" % (arg))
    return arg


def parse_args() -> argparse.Namespace:
    """Build the Parsnp command-line interface and parse ``sys.argv``.

    Options are grouped by pipeline stage: input/output, MUMi genome
    recruitment, MUM search, LCB alignment, recombination filtration, LCB
    extension, and miscellaneous runtime settings. Several options keep a
    legacy long alias (e.g. ``--MUMi``) for backwards compatibility.

    Returns:
        The parsed argument namespace.
    """
    parser = argparse.ArgumentParser(description="""
    Parsnp quick start for three example scenarios:
    1) With reference & genbank file:
    python Parsnp.py -g <reference_genbank_file1 reference_genbank_file2 ...> -d <seq_file1 seq_file2 ...>  -p <threads>

    2) With reference but without genbank file:
    python Parsnp.py -r <reference_genome> -d <seq_file1 seq_file2 ...> -p <threads>

    3) Autorecruit reference to a draft assembly:
    python Parsnp.py -q <draft_assembly> -d <seq_file1 seq_file2 ...> -p <threads>
    """, formatter_class=argparse.RawTextHelpFormatter)
    #TODO Use lambda to check files and directories
    input_output_args = parser.add_argument_group(title="Input/Output")
    input_output_args.add_argument(
        "-c",
        "--curated",
        action = "store_true",
        help = "(c)urated genome directory, use all genomes in dir and ignore MUMi?")
    input_output_args.add_argument(
        "-d",
        "--sequences",
        type = str,
        nargs = '+',
        required = True,
        help = "A list of files containing genomes/contigs/scaffolds")
    # "!" is a sentinel meaning "pick a random reference"; is_valid_file_path
    # passes it through unvalidated.
    input_output_args.add_argument(
        "-r",
        "--reference",
        type = lambda fname: is_valid_file_path(parser, fname),
        default = "",
        help = "(r)eference genome (set to ! to pick random one from sequence dir)")
    #TODO Accept as space-separated input and parse automatically w/ argparse
    input_output_args.add_argument(
        "-g",
        "--genbank",
        nargs = '+',
        help = "A list of Genbank file(s) (gbk)")
    # The default is a placeholder that __main__ replaces with a timestamped
    # "P_<date>_<time>" directory under the current working directory.
    input_output_args.add_argument(
        "-o",
        "--output-dir",
        type = str,
        default = "[P_CURRDATE_CURRTIME]")
    input_output_args.add_argument(
        "-q",
        "--query",
        type = str,
        help = "Specify (assembled) query genome to use, in addition to genomes found in genome dir")

    MUMi_args = parser.add_argument_group(title="MUMi")
    MUMi_mutex_args = MUMi_args.add_mutually_exclusive_group()
    #TODO whats the default?
    MUMi_mutex_args.add_argument(
        "-U",
        "--max-mumi-distr-dist",
        "--MUMi",
        type = float,
        default = 0.5,
        help = "Max MUMi distance value for MUMi distribution")
    #TODO Not parsed in current parsnp version and had a duplicate -i flag. Is this no longer used?
    MUMi_mutex_args.add_argument(
        "-mmd",
        "--max-mumi-distance",
        type = float,
        help = "Max MUMi distance (default: autocutoff based on distribution of MUMi values)")
    MUMi_args.add_argument(
        "-F",
        "--fastmum",
        action = "store_true",
        help = "Fast MUMi calculation")
    MUMi_args.add_argument(
        "-M",
        "--mumi_only",
        "--onlymumi",
        action = "store_true",
        help = "Calculate MUMi and exit? overrides all other choices!")

    # Recruitment strategies are mutually exclusive; the default (neither
    # flag) uses parsnp's built-in MUMi recruitment.
    MUMi_rec_prog = MUMi_args.add_mutually_exclusive_group()
    # MUMi_rec_prog.add_argument(
        # "--use-mummer-mumi",
        # action = "store_true",
        # help = "Use mummer for MUMi distance genome recruitment")
    MUMi_rec_prog.add_argument(
        "--use-ani",
        action = "store_true",
        help = "Use ani for genome recruitment")
    MUMi_args.add_argument(
        "--min-ani",
        type = float,
        default = 90,
        help = "Min ANI value to allow for genome recruitment.")
    MUMi_rec_prog.add_argument(
        "--use-mash",
        action = "store_true",
        help = "Use mash for genome recruitment")
    MUMi_args.add_argument(
        "--max-mash-dist",
        type = float,
        default = .1,
        help = "Max mash distance.")

    MUM_search_args = parser.add_argument_group(title="MUM search")
    # Anchor/MUM lengths are strings because they may be formulas evaluated
    # by the core binary (S = reference genome length).
    #new, default to lower, 12-17
    MUM_search_args.add_argument(
        "-a",
        "--min-anchor-length",
        "--anchorlength",
        type = str,
        default = "1.1*(Log(S))",
        help = "Min (a)NCHOR length (default = 1.1*(Log(S)))")
    MUM_search_args.add_argument(
        "-m",
        "--mum-length",
        "--mumlength",
        type = str,
        default = "1.1*(Log(S))",
        help = "Mum length")
    MUM_search_args.add_argument(
        "-C",
        "--max-cluster-d",
        "--clusterD",
        type = int,
        default = 300,
        help = "Maximal cluster D value")
    MUM_search_args.add_argument(
        "-z",
        "--min-cluster-size",
        "--minclustersize",
        type = int,
        default = 21,
        help = "Minimum cluster size")
    #TODO -z was a duplicate flag but no longer parsed as min-lcb-size in the current parsnp version
    # MUM_search_args.add_argument(
            # "-z",
            # "--min-lcb-size",
            # type = int,
            # default = 25,
            # help = "Min LCB si(z)e")

    LCB_alignment_args = parser.add_argument_group(title="LCB alignment")
    LCB_alignment_args.add_argument(
        "-D",
        "--max-diagonal-difference",
        "--DiagonalDiff",
        metavar = "MAX_DIAG_DIFF",
        type = str,
        default="0.12",
        help = "Maximal diagonal difference. Either percentage (e.g. 0.2) or bp (e.g. 100bp)")
    LCB_alignment_args.add_argument(
        "-n",
        "--alignment-program",
        "--alignmentprog",
        type = str,
        choices = list(ALIGNER_TO_IDX.keys()),
        default = "muscle",
        help = "Alignment program to use")
    LCB_alignment_args.add_argument(
        "-u",
        "--unaligned",
        action = "store_true",
        help = "Output unaligned regions")

    recombination_args = parser.add_argument_group("Recombination filtration")
    #TODO -x was a duplicate flag but no longer parsed as filter-phipack-snps in the current parsnp version
    # recombination_args.add_argument(
            # "-x",
            # "--filter-phipack-snps",
            # action = "store_true",
            # help = "Enable filtering of SNPs located in PhiPack identified regions of recombination")
    extend_args = parser.add_argument_group("LCB Extension")
    extend_args.add_argument(
        "--extend-lcbs",
        action="store_true",
        help="Extend the boundaries of LCBs with an ungapped alignment")
    extend_args.add_argument(
        "--extend-ani-cutoff",
        type=float,
        default=0.95,
        help="Cutoff ANI for lcb extension")
    extend_args.add_argument(
        "--extend-indel-cutoff",
        type=int,
        default=50,
        help="Cutoff for indels in LCB extension region. LCB extension will be at most min(seqs) + cutoff bases")
    extend_args.add_argument(
        "--match-score",
        type=float,
        default=5,
        help="Value of match score for extension")
    extend_args.add_argument(
        "--mismatch-penalty",
        type=float,
        default=-4,
        help="Value of mismatch score for extension (should be negative)")
    extend_args.add_argument(
        "--gap-penalty",
        type=float,
        default=-2,
        help="Value of gap penalty for extension (should be negative)")

    misc_args = parser.add_argument_group("Misc")
    misc_args.add_argument(
        "--skip-phylogeny",
        action="store_true",
        help="Do not generate phylogeny from core SNPs")
    misc_args.add_argument(
        "--validate-input",
        action="store_true",
        help="Use Biopython to validate input files")
    misc_args.add_argument(
        "--use-fasttree",
        action = "store_true",
        help = "Use fasttree instead of RaxML")
    misc_args.add_argument(
        "--vcf",
        action = "store_true",
        help = "Generate VCF file.")
    misc_args.add_argument(
        "-p",
        "--threads",
        type = int,
        default = 1,
        help = "Number of threads to use")
    misc_args.add_argument(
        "-P",
        "--max-partition-size",
        type = int,
        default = 15000000,
        help = "Max partition size (limits memory usage)")
    misc_args.add_argument(
        "-v",
        "--verbose",
        action = "store_true",
        help = "Verbose output")
    misc_args.add_argument(
        "-x",
        "--xtrafast",
        action = "store_true")
    misc_args.add_argument(
        "-i",
        "--inifile",
        "--ini-file",
        type = str)
    misc_args.add_argument(
        "-e",
        "--extend",
        action = "store_true")
    misc_args.add_argument(
        "-V",
        "--version",
        action = "version",
        version = "%(prog)s " + __version__)

    todo_args = parser.add_argument_group("Miscellaneous")
    # todo_args.add_argument(
        # "-l",
        # "--layout",
        # action = "store_true")
    # todo_args.add_argument(
        # "-s",
        # "--split",
        # action = "store_true",
        # help = "Split genomes by n's")
    return parser.parse_args()
####################################################################################################
#print("-g = <bool>: auto-launch (g)ingr? (default = NO)"


if __name__ == "__main__":
    t1 = time.time()
    logger.info(f"|--Parsnp {VERSION}--|\n")


    parsnp_dir= sys.path[0]
    #print parsnp_dir
    #PARSNP_DIR = parsnp_dir
    opts = []
    args = []

    OSTYPE, BINARY_TYPE = get_os()
    args = parse_args()
    currdir = os.getcwd()
    logging_level = logging.DEBUG if args.verbose else logging.INFO
    ref = args.reference
    randomly_selected_ref = False
    if ref == '!':
        randomly_selected_ref = True
    input_files = args.sequences
    query = args.query
    anchor = args.min_anchor_length
    #TODO I'm guessing mummer_mumi was intended to be an option?
    # use_mummer_mumi = args.use_mummer_mumi
    use_mummer_mumi = False
    use_ani = args.use_ani
    use_mash = args.use_mash
    use_fasttree = args.use_fasttree
    use_parsnp_mumi = not (use_mash or use_mummer_mumi or use_ani)
    mum = args.mum_length
    maxpartition = args.max_partition_size
    fastmum = args.fastmum
    cluster = args.max_cluster_d
    curated = args.curated
    aligner = ALIGNER_TO_IDX[args.alignment_program.lower()]
    threads = args.threads
    unaligned = "0" if not args.unaligned else "1"
    mincluster = args.min_cluster_size
    diagdiff = args.max_diagonal_difference
    # splitseq = args.split
    extend = args.extend
    # layout = args.layout
    xtrafast = args.xtrafast
    inifile = args.inifile
    inifile_exists = args.inifile is not None
    mumi_only = args.mumi_only
    mumidistance = args.max_mumi_distr_dist
    max_mash_dist = args.max_mash_dist
    min_ani_cutoff = args.min_ani
    outputDir = args.output_dir
    genbank_file = ""
    genbank_files = []
    genbank_files_cat = ""
    genbank_ref = ""
    reflen = 0
    use_gingr = ""
    generate_vcf = args.vcf
    filtreps = False

    repfile = ""
    multifasta = False
    ref_seqs = {}

    logger.setLevel(logging_level)
    for handler in logger.handlers:
        handler.setLevel(logging_level)

    # Check for dependencies
    missing = False
    for exe in ["Profile", "raxmlHPC-PTHREADS", "harvesttools"]:
        if shutil.which(exe) is None:
            missing = True
            logger.critical("{} not in system path!".format(exe))
    if use_fasttree:
        has_fasttree = False
        for exe in ["fasttree", "FastTree", "FastTreeMP"]:
            if shutil.which(exe) is not None:
                has_fasttree = True
        if not has_fasttree:
            logger.critical("No fasttree executable found in system path!".format(exe))
        missing = missing or (not has_fasttree)
    if missing:
        sys.exit(1)

    # Create output dir
    if outputDir == "." or outputDir == "./" or outputDir == "/":
        logger.critical("Specified output dir is current working dir or root dir! will clobber any parsnp.* results")
        sys.exit(1)
    elif outputDir == "[P_CURRDATE_CURRTIME]":
        today = datetime.datetime.now()
        timestamp = "P_"+today.isoformat().replace("-","_").replace(".","").replace(":","").replace("T","_")
        outputDir = os.getcwd()+os.sep+timestamp
    os.makedirs(outputDir, exist_ok=True)
    shutil.rmtree(os.path.join(outputDir, "tmp"), ignore_errors=True)
    os.makedirs(os.path.join(outputDir, "tmp"), exist_ok=True)

    input_files_processed = []
    for input_f in input_files:
        if os.path.isdir(input_f):
            for f in os.listdir(input_f):
                f = os.path.join(input_f, f)
                if os.path.isfile(f):
                    input_files_processed.append(f)
        elif os.path.isfile(input_f):
            input_files_processed.append(input_f)
        else:
            logger.error("{} is not a valid file".format(input_f))
    input_files = input_files_processed
    if len(input_files) < 2:
        logger.critical("Less than 2 input sequences provided...")
        sys.exit(1)
    for f in input_files + ([ref] if ref and ref != "!" else []):
        try:
            records = list(SeqIO.parse(f, "fasta"))
        except:
            logger.error("{} is an invalid sequence file!".format(f))
        if args.extend_lcbs and len(records) > 1:
            print(f)
            logger.error("Extending LCBs does not currently work with multi-contig inputs yet")
            sys.exit(1)
        for record in records:
            if any(c not in "GATCRYWSMKHBVDN" + "GATCRYWSMKHBVDN".lower() for c in record.seq):
                logger.error("Genome sequence {} has invalid characters {}! Skip!".format(f, set(str(record.seq)) - set("AGCTNagctn")))
                continue

    # Parse reference if necessary
    if ref and ref != "!":
        try:
            rf = open(ref, 'r')
            rfd = rf.read()
            refseqs = rfd.split(">")[1:]
            currpos = 0
            if len(refseqs) > 1:
                multifasta = True
                for seqnum, seq in enumerate(refseqs):
                    seq = seq.split('\n', 1)[1]
                    fastalen = len(seq) - seq.count('\n')
                    ref_seqs[currpos + fastalen] = seqnum
                    currpos += fastalen
            rf.close()
        except IOError as e:
            logger.critical(" Reference genome file %s not found\n"%(ref))
            sys.exit(1)

    # Validate genbank files
    #TODO Make this a function
    # return genbank_ref
    if args.genbank:
        genbank_files = args.genbank
        genbank_files_processed = []
        for genbank_f in genbank_files:
            if os.path.isdir(genbank_f):
                for f in os.listdir(genbank_f):
                    f = os.path.join(genbank_f, f)
                    if os.path.isfile(f):
                        genbank_files_processed.append(f)
            elif os.path.isfile(genbank_f):
                genbank_files_processed.append(genbank_f)
            else:
                logger.error("{} is not a valid file".format(genbank_f))
        genbank_files = genbank_files_processed
        if len(genbank_files) == 0:
            logger.critical("No valid genbank files provided...")
            sys.exit(1)
        ctcmd = "cat "

        first = True
        #genbank_ref = ""
        for genbank_file in genbank_files:
            if len(genbank_file) <= 1:
                continue
            ctcmd += genbank_file + " "
            genbank_ref = os.path.join(outputDir, "tmp", os.path.basename(genbank_file)+".fna")
            try:
                #parse out reference, starts at ORIGIN ends at //, remove numbers,
                rf = open(genbank_file,'r')
                genbank_ref_d = open(genbank_ref, "a+")
                while True:
                    giline = rf.readline()
                    if "VERSION" and "GI" in giline:
                        break
                    elif giline == None or giline == "":
                        logger.critical("Genbank file %s malformatted \n"%(genbank_file))
                        sys.exit(1)
                if len(giline) <= 2:
                    logger.critical("Genbank file %s malformatted \n"%(genbank_file))
                    sys.exit(1)
                genbank_ref_d.write(">gi|"+giline.split("GI:")[-1])
                first = False
                ntdata = False
                data = ""
                for line in rf:
                    if ntdata:
                        if "//" in line:
                            ntdata = False
                            break
                        data += line[9:].replace(" ","")
                    if "ORIGIN" in line:
                         ntdata = True

                rf.close()
                if len(data) < 10:
                      logger.critical("Genbank file %s contains no sequence data\n"%(genbank_file))
                      sys.exit(1)
                genbank_ref_d.write(data.upper())
                genbank_ref_d.close()
            except IOError as e:
                logger.critical("Genbank file %s not found\n"%(genbank_file))
                sys.exit(1)



    sortem = True
    ref_string = ref
    genome_string = ""
    if len(input_files) > 1:
        genome_string = "\n\t"
        if len(input_files) > 4:
            genome_string += "\n\t".join(input_files[:2])
            genome_string += "\n\t...{} more file(s)...\n\t".format(len(input_files) - 4)
            genome_string += "\n\t".join(input_files[-2:])
        else:
            genome_string += "\n\t".join(input_files)
    else:
        genome_string = input_files[0]
    if len(ref) == 0 and len(genbank_ref) != 0:
        #we are parsing from genbank, set ref to genbank_ref && turn off sorting
        ref = genbank_ref
        if len(genbank_files) > 1:
            ref_string = "\n\t"
            if len(genbank_files) > 4:
                ref_string += "\n\t".join(genbank_files[:2])
                ref_string += "\n\t...{} more file(s)...\n\t".format(len(genbank_files) - 4)
                ref_string += "\n\t".join(genbank_files[-2:])
            else:
                ref_string = "\n\t".join(genbank_files)
        else:
            ref_string += genbank_files[0]

        sortem = False

    autopick_ref = False
    if (not ref and not query) or not input_files:
        logger.critical("No seqs provided, yet required. exit!")
        sys.exit(0)  # TODO Should this exit value be 0?
    elif not ref and query:
        logger.warning("No reference genome specified, going to autopick from input as closest to %s\n"%(query))
        autopick_ref = True
        ref = query
    print("Ref", ref)

    logger.info("""
{}
SETTINGS:
|-refgenome:\t{}
|-genomes:\t{}
|-aligner:\t{}
|-outdir:\t{}
|-OS:\t{}
|-threads:\t{}
{}
    """.format(
        (len(outputDir)+17)*"*",
        "autopick" if ref == '!' else ref_string,
        genome_string,
        args.alignment_program,
        outputDir,
        OSTYPE,
        threads,
        (len(outputDir)+17)*"*"))



    if multiprocessing.cpu_count() < threads:
        logger.warning("You have asked to use more threads than you have available on your machine. This may lead to serious performance degredation with RAxML.")
    logger.info("<<Parsnp started>>")

    #1)read fasta files (contigs/scaffolds/finished/DBs/dirs)
    # logger.info("Reading Genbank file(s) for reference (.gbk) %s"%("\t".join(genbank_files)))
    if len(genbank_file) == 0:
        logger.info("No genbank file provided for reference annotations, skipping..")

    allfiles = []
    fnaf_sizes = {}
    allfile_dict = {}
    reflen = 0
    fnafiles = []
    if ref == "!":
        ref = random.choice(input_files)

    # Check if reference genome is aligned
    with open(ref, 'r') as ff:
        hdr = ff.readline()
        seq = ff.read()
        if hdr[0] != ">":
            logger.critical("Reference {} has improperly formatted header.".format(ref))
            sys.exit(1)
        for line in seq.split('\n'):
            if '-' in line and line[0] != ">":
                logger.warning("Reference genome sequence %s has '-' in the sequence!"%((ref)))
        reflen = len(seq) - seq.count('\n')

    for input_file in input_files[:]:
        try:
            record = list(SeqIO.parse(input_file, "fasta"))
            if len(record) == 0:
                input_files.remove(input_file)
                logger.error(f"{input_file} is an empty file!")
                continue
        except:
            input_files.remove(input_file)
            logger.error(f"Could not parse {input_file}!")
            continue

        ff = open(input_file, 'r')
        hdr = ff.readline()
        seq = ff.read()
        name_flag = True
        seqlen = len(seq) - seq.count('\n')
        if hdr[0] != ">":
            logger.error("{} has improperly formatted header. Skip!".format(input_file))
            continue
        elif '-' in seq:
            seq = seq.split('\n')
            if any('-' in l and ('>' not in l) for l in seq):
                logger.error("Genome sequence %s seems to be aligned! Skip!"%((input_file)))
                continue
        elif seqlen <= 20:
            logger.error("File %s is less than or equal to 20bp in length. Skip!"%(input_file))
            continue
        sizediff = float(reflen)/float(seqlen)

        # EDITED THIS TO CHANGE GENOME THRESHOLD
        # WILL NOW CONSIDER CONCATENATED GENOMES THAT ARE MUCH BIGGER THAN THE REFERENCE
        if curated:
            log_f = logger.warning
            msg = ""
        else:
            log_f = logger.error
            msg = "Skipping..."
        if sizediff <= 0.6:
            log_f("File {} is {:.2f}x longer than reference! {}".format(
                input_file, 1/sizediff, msg))
            if not curated:
                continue
        elif sizediff >= 1.4:
            log_f("File {} is {:.2f}x shorter than reference genome! {}".format(
                input_file, sizediff, msg))
            if not curated:
                continue
        fnafiles.append(input_file)
        fnaf_sizes[input_file] = seqlen
        ff.close()

    # if ref in fnafiles:
        # fnafiles.remove(ref)

    #sort reference by largest replicon to smallest
    if ref in fnafiles:
        fnafiles = [f for f in fnafiles if f != ref]
    elif sortem and os.path.exists(ref) and not autopick_ref:
        sequences = SeqIO.parse(ref, "fasta")
        new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
        SeqIO.write(sequences, new_ref, "fasta")
        ref = new_ref
        # logger.debug("Sorting reference replicons")
        # ff = open(ref, 'r')
        # seqs = ff.read().split(">")[1:]
        # seq_dict = {}
        # seq_len = {}
        # for seq in seqs:
            # try:
                # hdr, seq = seq.split("\n",1)
            # except ValueError:
                # # TODO Why do we ignore when theres a header but no sequence?
                # continue
            # seq_dict[hdr] = seq
            # seq_len[hdr] = len(seq) - seq.count('\n')
        # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
        # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
        # ffo = open(ref, 'w')
        # for hdr, seq in seq_len_sort:
            # ffo.write(">%s\n"%(hdr))
            # ffo.write("%s"%(seq_dict[hdr]))
        # ff.close()
        # ffo.close()
    else:
        ref = genbank_ref

    # TODO stray comment: remove any query sequences 30% diff in length
    # allfiles tracks every genome that will appear in the alignment,
    # starting with the (basename of the) reference.
    allfiles = [os.path.basename(ref)]
    #write INI file
    if not inifile_exists:
        logger.debug("Writing .ini file")
        # NOTE(review): "xtrafast or 1" is always true, so extend is
        # unconditionally forced to False here.
        if xtrafast or 1:
            extend = False

        # Fill in the template's $PLACEHOLDER tokens with run parameters.
        inifiled = open("%s/template.ini"%(PARSNP_DIR), 'r').read()
        inifiled = inifiled.replace("$REF", ref)
        inifiled = inifiled.replace("$EXTEND", "%d"%(extend))
        inifiled = inifiled.replace("$ANCHORS", str(anchor))
        inifiled = inifiled.replace("$MUMS", str(mum))
        inifiled = inifiled.replace("$MINCLUSTER", str(mincluster))
        inifiled = inifiled.replace("$CLUSTERD", str(cluster))
        inifiled = inifiled.replace("$THREADS", str(threads))
        inifiled = inifiled.replace("$ALIGNER", str(aligner))
        inifiled = inifiled.replace("$DIAGDIFF", str(diagdiff))
        inifiled = inifiled.replace("$RECOMBFILT", "%d"%(xtrafast))
        inifiled = inifiled.replace("$OUTDIR", outputDir)
        # With --fastmum the partition size is fixed at 20% of the reference
        # length instead of the user-supplied maximum partition size.
        if fastmum:
            inifiled = inifiled.replace("$PARTPOS","%d"%(0.2*reflen))
        else:
            inifiled = inifiled.replace("$PARTPOS","%s"%(maxpartition))

        # Build a variant ini listing every candidate genome with MUMi
        # calculation enabled; used by the recruitment step below.
        file_string = ""
        for cnt, fna_file in enumerate(fnafiles, 1):
            file_string += "file%d=%s\n"%(cnt, fna_file)
            file_string += "reverse%d=0\n"%(cnt)
        inifiled_mumi = inifiled.replace("$FILES\n", file_string)
        inifiled_mumi = inifiled_mumi.replace("calcmumi=0","calcmumi=1")
        inifile_mumi = open(os.path.join(outputDir, "all_mumi.ini"), 'w')
        inifile_mumi.write(inifiled_mumi)
        inifile_mumi.close()

    #2)get near neighbors (mumi distance)
    # Remove stale outputs from any previous run so they can't be mixed in.
    if os.path.exists(os.path.join(outputDir, "alltogether.fasta")):
        os.remove(os.path.join(outputDir, "alltogether.fasta"))
    if os.path.exists(os.path.join(outputDir, "blocks/b1")):
        ftrm = glob(os.path.join(outputDir, "blocks/b*"))
        for f in ftrm:
            shutil.rmtree(f)

    fileidx = -1

    hit_dict = {}
    qry_hit_dict = {}
    hdr_dict = {}
    length_dict = {}

    # Total genome count: all query genomes plus the reference.
    TOTSEQS= len(fnafiles) + 1
    seqids_list = []

    # Need at least one query genome and a reference to align anything.
    if len(fnafiles) < 1 or ref == "":
        logger.critical("Parsnp requires 2 or more genomes to run, exiting")
        logger.debug("Only files found are: {}\n{} ".format(fnafiles, ref))
        sys.exit(1)

    mumi_dict = {}
    # finalfiles: genomes recruited into the final alignment.
    # auto_ref: reference chosen automatically when --autopick-ref is set.
    finalfiles = []
    auto_ref = ""
    if not curated:
        logger.info("Recruiting genomes...")
        if use_parsnp_mumi:
            # Run parsnp_core on the mumi ini to compute a MUMi distance
            # from the reference to every candidate genome.
            if not inifile_exists:
                command = "%s/bin/parsnp_core %sall_mumi.ini"%(PARSNP_DIR,outputDir+os.sep)
            else:
                # TODO why are we editing the suffix of a provided file?
                command = "%s/bin/parsnp_core %s"%(PARSNP_DIR,inifile.replace(".ini","_mumi.ini"))
            run_command(command)
            try:
                # all.mumi rows look like "<1-based index>:<mumi value>";
                # keys are converted to 0-based indexes into fnafiles.
                mumif = open(os.path.join(outputDir, "all.mumi"),'r')
                for line in mumif:
                    line = line.rstrip('\n')
                    idx, mi = line.split(":")
                    mumi_dict[int(idx)-1] = float(mi)
            except IOError:
                # Best effort: if MUMi output is missing, recruit everything.
                logger.error("MUMi file generation failed... use all?")
                for i, _ in enumerate(fnafiles):
                    mumi_dict[i] = 1
            lowest_mumi = 100

            # With --autopick-ref, the genome nearest to the current
            # reference (lowest MUMi) becomes the new reference.
            if autopick_ref:
                for idx in list(mumi_dict.keys()):
                    #TODO is there a way to organize these via dict rather than list? Seems error prone
                    if mumi_dict[idx] < lowest_mumi:
                        auto_ref = fnafiles[idx]
                        ref = auto_ref
                        lowest_mumi = mumi_dict[idx]
            mumi_f = ""
            # Only opened here; the close below must tolerate the other paths.
            if mumi_only and not curated:
                mumi_f = open(os.path.join(outputDir, "recruited_genomes.lst"),'w')


            # Collect the MUMi values of the (at most) 101 closest genomes
            # that fall under the recruitment threshold; they feed the
            # --fastmum outlier cutoff (hpv) used below.
            sorted_x = sorted(iter(mumi_dict.items()), key=operator.itemgetter(1))
            mumivals = []
            for scnt, item in enumerate(sorted_x):
                # Only the 101 lowest-MUMi genomes are considered.
                # (The original also tested "scnt >= len(sorted_x)", which
                # can never be true inside enumerate — dead code removed.)
                if scnt > 100:
                    break
                if float(item[1]) < float(mumivals_threshold := float(mumidistance)):
                    mumivals.append(float(item[1]))
            # numpy.percentile(vals, 0) is the minimum of the values.
            # (Original read "minv = minv = ..." — duplicated target removed.)
            minv = numpy.percentile(mumivals, 0) if len(mumivals) > 0 else 1.0
            dvals = mumivals

            # hpv = min + 3*stddev is the upper MUMi cutoff for --fastmum.
            stdv = 0
            hpv = 0
            if len(dvals) > 0:
                stdv = numpy.std(dvals)
                hpv = minv + (3*stdv)

            # Recruit every genome under the MUMi threshold (all genomes
            # when --curated); with --fastmum, additionally drop genomes
            # whose MUMi exceeds the hpv outlier cutoff.
            for idx in mumi_dict.keys():
                if mumi_dict[idx] < (float(mumidistance)) or curated:
                    if fastmum and mumi_dict[idx] > hpv:
                        continue
                    #TODO if 1, why is this?
                    if 1 or auto_ref != fnafiles[idx]:
                        if mumi_only:
                            # Record "<abs path>,<mumi>" for --mumi-only output.
                            mumi_f.write(os.path.abspath(fnafiles[idx])+",%f"%(mumi_dict[idx])+"\n")
                        finalfiles.append(fnafiles[idx])
                        allfiles.append(fnafiles[idx])

        else:
            # Mash- or FastANI-based recruitment (no parsnp_core MUMi pass).
            try:
                tmp_dir = outputDir
                # Write the candidate genome list once; both tools take it
                # as a file-of-filenames.
                all_genomes_fname = os.path.join(tmp_dir, "genomes.lst")
                with open(all_genomes_fname, 'w') as all_genomes_f:
                    all_genomes_f.writelines((line + '\n' for line in fnafiles))
                if use_mash:
                    if randomly_selected_ref:
                        logger.warning("You are using a randomly selected genome to recruit genomes from your input...")
                    # mash dist -t: tabular output; -d caps the reported
                    # distance, so every returned row is already within
                    # max_mash_dist of the reference.
                    mash_out = subprocess.check_output([
                            "mash", "dist", "-t",
                            "-d", str(max_mash_dist),
                            "-p", str(threads),
                            ref,
                            "-l", all_genomes_fname],
                        stderr=open(os.path.join(outputDir, "mash.err"), 'w')).decode('utf-8')
                    # Skip the header row; keep column 0 (file name) of every
                    # row that actually carries a distance value.
                    finalfiles = [line.split('\t')[0] for line in mash_out.split('\n')[1:] if line != '' and len(line.split('\t')) > 1 and line.split('\t')[1].strip() != '']
                elif use_ani:
                    if randomly_selected_ref:
                        # All-vs-all ANI so a better reference can be chosen
                        # from the results below.
                        subprocess.check_call([
                                "fastANI",
                                "--ql", all_genomes_fname,
                                "--rl", all_genomes_fname,
                                "-t", str(threads),
                                "-o", os.path.join(outputDir, "fastANI.tsv")],
                            stderr=open(os.path.join(outputDir, "fastANI.err"), 'w'))
                    else:
                        # One-vs-all ANI against the provided reference.
                        subprocess.check_call([
                                "fastANI",
                                "-q", ref,
                                "--rl", all_genomes_fname,
                                "-t", str(threads),
                                "-o", os.path.join(outputDir, "fastANI.tsv")],
                            stderr=open(os.path.join(outputDir, "fastANI.err"), 'w'))
                    # Map each query genome to the set of genomes within the
                    # ANI cutoff of it.
                    genome_to_genomes = defaultdict(set)
                    with open(os.path.join(outputDir, "fastANI.tsv")) as results:
                        for line in results:
                            # FastANI results file -> Query, Ref, ANI val, extra stuff,,,
                            line = line.split('\t')
                            if float(line[2]) >= min_ani_cutoff:
                                genome_to_genomes[line[0]].add(line[1])

                        # for g in genome_to_genomes:
                            # print(len(g))
                        # Recruit from the genome with the most neighbors.
                        ani_ref = max(genome_to_genomes, key=(lambda key: len(genome_to_genomes[key])))
                        if autopick_ref:
                            auto_ref = ani_ref
                        finalfiles = list(genome_to_genomes[ani_ref])

                # shutil.rmtree(tmp_dir)
            except subprocess.CalledProcessError as e:
                logger.critical(
                    "Recruitment failed with exception {}. More details may be found in the *.err output log".format(str(e)))
                # shutil.rmtree(tmp_dir)
            allfiles.extend(finalfiles)

    if curated:
        # With --curated, every input genome is kept regardless of distance.
        for f in fnafiles:
            if f not in finalfiles:
                finalfiles.append(f)
            if f not in allfiles:
                allfiles.append(f)

    if mumi_only:
        # mumi_f is only opened when use_parsnp_mumi and not curated; in the
        # curated or mash/ANI paths it is "" or unbound, and the original
        # unconditional close() crashed here. Tolerate both cases.
        try:
            mumi_f.close()
        except (NameError, AttributeError):
            pass
        # Exit code 1 preserved for backward compatibility with callers.
        sys.exit(1)

    orig_auto_ref = auto_ref
    if os.path.exists(auto_ref) and autopick_ref:
        #TODO This code block is duplicated
        # Re-sort the auto-picked reference's contigs by length (longest
        # first) and write them to <outputDir>/<name>.ref, which then
        # becomes the reference used by the aligner.
        with open(auto_ref, 'r') as ff:
            seqs = ff.read().split(">")[1:]
        seq_dict = {}
        seq_len = {}
        for seq in seqs:
            try:
                hdr, seq = seq.split("\n",1)
            except ValueError:
                # Header with no sequence data; skip it.
                continue
            # Fix: original assigned undefined name "nt" here (NameError).
            seq_dict[hdr] = seq
            # Sequence length excluding embedded newlines.
            seq_len[hdr] = len(seq) - seq.count('\n')
        # Fix: original used Python 2's dict.iteritems(), which raises
        # AttributeError on Python 3.
        seq_len_sort = sorted(seq_len.items(), key=operator.itemgetter(1), reverse=True)
        auto_ref = os.path.join(outputDir, os.path.basename(auto_ref)+".ref")
        # Fix: original opened `ref` for writing here, clobbering the old
        # reference file and leaving the new auto_ref path never written.
        with open(auto_ref, 'w') as ffo:
            for hdr, _ in seq_len_sort:
                ffo.write(">%s\n"%(hdr))
                ffo.write("%s"%(seq_dict[hdr]))
        ref = auto_ref

    inifiled_closest = inifiled
    #TODO This code is duplicated
    # Build the final aligner ini (all recruited genomes) and a second
    # "closest-genome" ini (psnn.ini) listing only the nearest genome.
    if not inifile_exists:
        if len(finalfiles) < 1 or ref == "":
            logger.critical("Parsnp requires 2 or more genomes to run, exiting\n")
            sys.exit(1)

        file_string = ""
        file_string_closest = ""
        # Only the single closest genome (finalfiles[0]) goes into psnn.ini.
        for cnt, f in enumerate(finalfiles[0:1], 1):
            file_string_closest += "file%d=%s\n"%(cnt, f)
            file_string_closest += "reverse%d=0\n"%(cnt)
        for cnt, f in enumerate(finalfiles, 1):
            file_string += "file%d=%s\n"%(cnt, f)
            file_string += "reverse%d=0\n"%(cnt)
        # Fix: substitute both file lists into the raw template. The
        # original derived inifiled_closest from inifiled AFTER $FILES had
        # already been consumed, so psnn.ini silently contained all files.
        template = inifiled
        inifiled = template.replace("$FILES\n", file_string)
        #new, output unaligned regions
        inifiled = inifiled.replace("$UNALIGNED", unaligned)
        inifiled_closest = template.replace("$FILES\n", file_string_closest)
        inifiled_closest = inifiled_closest.replace("$UNALIGNED", unaligned)

        if fastmum:
            inifiled = inifiled.replace("p=%d"%(0.2*reflen), "p=%s"%(maxpartition))
            # Fix: original called .replace on inifiled here, overwriting
            # inifiled_closest with the all-files ini.
            inifiled_closest = inifiled_closest.replace("p=%d"%(0.2*reflen), "p=%s"%(maxpartition))
        if autopick_ref:
            # Swap the auto-picked reference and the original query paths in
            # both ini files, using a temporary "tmp_" marker so the two
            # replacements don't collide.
            inifiled = inifiled.replace(orig_auto_ref, auto_ref)
            inifiled = inifiled.replace(auto_ref, "tmp_"+auto_ref)
            inifiled = inifiled.replace(query, auto_ref)
            inifiled = inifiled.replace("tmp_"+auto_ref, query)
            inifiled_closest = inifiled_closest.replace(orig_auto_ref, auto_ref)
            inifiled_closest = inifiled_closest.replace(auto_ref, "tmp_"+auto_ref)
            inifiled_closest = inifiled_closest.replace(query, auto_ref)
            inifiled_closest = inifiled_closest.replace("tmp_"+auto_ref, query)

        # Write both ini files (without clobbering the `inifile` path
        # variable, which the aligner retry loop below may still read).
        with open(outputDir+os.sep+"parsnpAligner.ini", 'w') as aligner_ini_f:
            aligner_ini_f.write(inifiled)
        with open(outputDir+os.sep+"psnn.ini", 'w') as closest_ini_f:
            closest_ini_f.write(inifiled_closest)


    #3)run parsnp (cores, grid?)
    logger.info("Running Parsnp multi-MUM search and libMUSCLE aligner...")
    blocks_dir = os.path.join(outputDir, "blocks")
    if not os.path.exists(blocks_dir):
        os.mkdir(blocks_dir)
    command = ""
    run_parsnp = 1
    if run_parsnp:
        successful_run = False
        # NOTE(review): maxruns is never consulted; the loop exits via the
        # hard-coded "runcnt >= 2" check below.
        maxruns = 2
        runcnt = 0
        while not successful_run:
            if not inifile_exists:
                # NOTE(review): "xtrafast and 0" is always false, so the
                # first branch is dead. On a retry (command already set)
                # the "closest genome" psnn.ini is used instead of the
                # full parsnpAligner.ini.
                if command == "" and xtrafast and 0:
                    command = "%s/parsnpA_fast %sparsnpAligner.ini"%(PARSNP_DIR,outputDir+os.sep)
                elif command == "":
                    command = "%s/bin/parsnp_core %sparsnpAligner.ini"%(PARSNP_DIR,outputDir+os.sep)
                else:
                    command = "%s/bin/parsnp_core %spsnn.ini"%(PARSNP_DIR,outputDir+os.sep)
            else:
                if not os.path.exists(inifile):
                    logger.error("ini file %s does not exist!\n"%(inifile))
                    sys.exit(1)
                command = "%s/bin/parsnp_core %s"%(PARSNP_DIR,inifile)
            run_command(command)

            # Success is judged by the presence of the aligner's XMFA output;
            # after two failed attempts the run is aborted.
            if not os.path.exists(os.path.join(outputDir, "parsnpAligner.xmfa")):
                successful_run = False
                runcnt += 1
                if runcnt >= 2:
                    logger.critical("Set of recruited genomes are too divergent for parsnp, please reduce MUMi (%f) and relaunch\n"%(float(mumidistance)))
                    sys.exit(1)
            else:
                successful_run = True
                runcnt += 1
                break
        shutil.move(
                os.path.join(outputDir, "parsnpAligner.xmfa"),
                os.path.join(outputDir, "parsnp.xmfa"))
    # NOTE(review): this handle is never explicitly closed in this section.
    xmfafile = open(os.path.join(outputDir, "parsnp.xmfa"),'r')

    file2hdr_dict = {}
    fileid = ""
    blockfiles = []

    #get coverage
    # Parse total reference coverage and per-sequence lengths out of the
    # aligner's log file.
    coverage = 0
    totlength = 0
    totseqs = 0
    try:
        cf = open(os.path.join(outputDir, "parsnpAligner.log"))
        for line in cf:
            if "Total coverage among all sequences:" in line:
                # e.g. "Total coverage among all sequences: 98.5%" -> 0.985
                coverage = line.split(":",1)[-1].replace("\n","")
                coverage = float(coverage.replace("%",""))/100.0
            elif "Length:" in line:
                # e.g. "Length: 12345bps" -> accumulate 12345
                totlength += int(line.split(":",1)[-1].replace("\n","").split("bps")[0])
                totseqs += 1
    except IOError:
        logger.critical("ParsnpAligner.log missing, parsnpAligner failed.")
        sys.exit(1)

    #update thresholds
    if coverage < 0.1 and not curated:
        if coverage <= 0.01:
            logger.critical("""Aligned regions cover less than 1% of reference genome, something is not right
Adjust params and rerun. If issue persists please submit a GitHub issue""")
            sys.exit(1)
        else:
            logger.warning("""Aligned regions cover less than 10% of reference genome!
Please verify recruited genomes are all strain of interest""")
    else:
        pass
    #print("-->Getting list of LCBs.."
    allbfiles = glob(os.path.join(blocks_dir, "b*/*"))
    blockfiles = []
    block_startpos = []
    block_dict = {}
    # Collect every LCB sequence file, recording its reference start/end
    # coordinates (parsed from the first FASTA header) and the reference
    # sequence of the block.
    for f in allbfiles:
        if os.path.isfile(f) and "seq.fna" in f:
            blockfiles.append(f)
            with open(f, 'r') as lf:
                header = lf.readline()
                if not header or header[0] != ">":
                    logger.error("Error with LCB: %s\n"%(f))
                    continue

                # Header is ">...:<spos>-<epos>+..."; coords precede the '+'.
                inf = header.split("+",1)[0]

                rseq = ""
                # Accumulate the first record's sequence; stop at the next
                # header OR at EOF (the original indexed lff[0] and raised
                # IndexError on a single-record block file).
                while 1:
                    lff = lf.readline()
                    if not lff or lff[0] == ">":
                        break
                    rseq += lff.replace("\n","")

            spos, epos = inf.split(":",1)[-1].split("-",1)
            block_startpos.append(int(spos))
            block_dict[f] = [int(spos), int(epos), rseq]
    run_repeat_filter = filtreps

    #initiate parallelPhiPack tasks
    run_recomb_filter = 0

    # Recombination filtering (PhiPack) is tied to the --xtrafast mode.
    if xtrafast:
        run_recomb_filter = 1
    else:
        run_recomb_filter = 0

    recombination_sites = {}
    bedfile = ""
    bedfile_dict = {}
    if run_recomb_filter and len(blockfiles) > 0:
        logger.info("Running PhiPack on LCBs to detect recombination...")
        bedfile = open(os.path.join(outputDir, "parsnp.rec"), 'w')
        tasks = []
        processed = []
        # One PhiPack task per LCB; the first record is read only so its
        # length can be recorded in the task parameters.
        for icnt, f in enumerate(blockfiles):
            seq1 = ""
            try:
                bf = open(f, 'r')
                seq1 = bf.read().split(">")[1].split("\n",1)[-1]
                seq1 = seq1.replace("\n","")
                bf.close()
            except IOError:
                # Best effort: a missing block file yields seqlen 0.
                pass

            processed.append(f)
            params = {}
            path, f = f.rsplit(os.path.sep,1)
            params["jobID"] = len(tasks)
            params["query"] = f
            params["seqlen"] = len(seq1)
            params["spos"] = block_startpos[icnt]
            params["dir"] = path
            params["output"] = os.path.join(path, "Profile.csv")
            tasks.append(params)

        #run parallelPhiPack
        pool = Pool(processes=int(threads))
        result = pool.map_async(parallelPhiWrapper,tasks).get()

        for i in result:
            if (i["status"] == 1):
                #process output
                recregions = ""
                block_spos = tasks[i["jobID"]]["spos"]
                try:
                    recregions = open(tasks[i["jobID"]]["output"],'r').read()
                except IOError:
                    logger.error("File %s doesn't exist, no rec regions or error in PhiPack\n"%(tasks[i["jobID"]]["output"]))
                    continue
                reclines = recregions.split("\n")
                prevpos = 0

                # Profile.csv rows are "<position>,<p-value>".
                for line in reclines:
                    try:
                        pos,eval = line.split(",")
                    except ValueError:
                        # Skip header/blank/malformed rows.
                        continue
                    pos = int(pos)
                    # NOTE(review): "eval" shadows the builtin of that name.
                    eval = float("%.5f"%(float(eval)))
                    # Significant recombination signal: 0 <= p < 0.01.
                    if eval < 0.01 and eval >= 0:
                        idx = 0
                        srpos = 0
                        # Pad the region by 50bp on each side, clamping the
                        # start at the block's start coordinate.
                        if pos-50 > 0:
                            srpos = (pos-50)+block_spos
                        else:
                            srpos = block_spos
                        eval = abs(eval)
                        if not multifasta:
                            bedfile_dict[srpos] = "1\t%s\t%s\tREC\t%.3f\t+\n"%(srpos,pos+50+block_spos,eval)
                        else:
                            # NOTE(review): this chromosome lookup compares a
                            # genomic position to len(chr_spos) — looks
                            # suspect; confirm intent against ref_seqs.
                            chrnum = 1
                            chr_spos = list(ref_seqs.keys())
                            for cs in ref_seqs:
                                if block_spos < len(chr_spos):
                                    chrnum = ref_seqs[cs]
                            bedfile_dict[srpos] = "%d\t%s\t%s\tREC\t%.3f\t+\n"%(chrnum,srpos,pos+50+block_spos,eval)

                qfile = tasks[i["jobID"]]["query"]

            elif i["status"] != 2:
                logger.critical("Parallel phipack job %d failed\n"%(i["jobID"]))
                raise IOError

        pool.close()
        pool.join()
        # Emit the recombinant regions sorted by start position.
        brkeys = list(bedfile_dict.keys())
        brkeys.sort()
        for key in brkeys:
            bedfile.write(bedfile_dict[key])
        bedfile.close()

    run_lcb_trees = 0

    annotation_dict = {}
    #TODO always using xtrafast?
    # parsnp_output is the XMFA consumed by the harvesttools steps below;
    # it is swapped for the extended XMFA when --extend-lcbs is set.
    parsnp_output = f"{outputDir}/parsnp.xmfa"
    if args.extend_lcbs:
        xmfa_file = f"{outputDir}/parsnp.xmfa"
        with TemporaryDirectory() as temp_directory:
            original_maf_file = f"{outputDir}/parsnp-original.maf"
            extended_xmfa_file = f"{outputDir}/parsnp-extended.xmfa"
            # Gather per-contig lengths and headers for the reference plus
            # all recruited genomes.
            fname_contigid_to_length, fname_contigidx_to_header, fname_to_seqrecord = ext.get_sequence_data(
                    ref,
                    finalfiles,
                    index_files=False)
            # Convert the XMFA to MAF to obtain per-cluster coordinates.
            fname_to_contigid_to_coords, fname_header_to_gcontigidx = ext.xmfa_to_maf(
                    xmfa_file,
                    original_maf_file,
                    fname_contigidx_to_header,
                    fname_contigid_to_length)
            # Write the unaligned inter-cluster regions to FASTA files in
            # the temp directory so candidate extensions can be scored.
            packed_write_result = ext.write_intercluster_regions(finalfiles + [ref], temp_directory, fname_to_contigid_to_coords)
            fname_contigid_to_cluster_dir_to_length, fname_contigid_to_cluster_dir_to_adjacent_cluster = packed_write_result
            cluster_files = glob(f"{temp_directory}/*.fasta")
            # Score how far each cluster can be extended, given the
            # user-supplied alignment scoring parameters.
            clusterdir_expand, clusterdir_len = ext.get_new_extensions(
                    cluster_files,
                    args.match_score,
                    args.mismatch_penalty,
                    args.gap_penalty)
            # Emit the extended XMFA, filtered by the ANI/indel cutoffs.
            ext.write_extended_xmfa(
                    original_maf_file,
                    extended_xmfa_file,
                    temp_directory,
                    clusterdir_expand,
                    clusterdir_len,
                    fname_contigid_to_cluster_dir_to_length,
                    fname_contigid_to_cluster_dir_to_adjacent_cluster,
                    fname_header_to_gcontigidx,
                    fname_contigid_to_length,
                    args.extend_ani_cutoff,
                    args.extend_indel_cutoff,
                    threads)
            parsnp_output = extended_xmfa_file
            # The intermediate MAF is no longer needed.
            os.remove(original_maf_file)


    if xtrafast or 1:
        # NOTE(review): "xtrafast or 1" is always true; the .ggr archive is
        # always built.
        #add genbank here, if present
        if len(genbank_ref) != 0:
            # GenBank reference(s): pass each annotation file with -g.
            rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"
            for file in genbank_files:
                rnc += " -g %s " %(file)
            run_command(rnc)
        else:
            # FASTA reference: pass it with -f.
            run_command(f"harvesttools -q -o {outputDir}/parsnp.ggr -f {ref} -x {parsnp_output}")

        # Fold the recombination and repeat annotation tracks into the archive.
        if run_recomb_filter:
            run_command("harvesttools -q -b %s/parsnp.rec,REC,\"PhiPack\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(outputDir,outputDir,outputDir))
        if run_repeat_filter:
            run_command("harvesttools -q -b %s,REP,\"Intragenomic repeats > 100bp\" -o %s/parsnp.ggr -i %s/parsnp.ggr"%(repfile,outputDir,outputDir))

        # Export the SNP mblocks (always) and a VCF (on request).
        run_command("harvesttools -q -i %s/parsnp.ggr -S "%(outputDir)+outputDir+os.sep+"parsnp.snps.mblocks")
        if generate_vcf:
            run_command("harvesttools -q -i %s/parsnp.ggr -V "%(outputDir)+outputDir+os.sep+"parsnp.vcf")

    if not args.skip_phylogeny:
        logger.info("Reconstructing core genome phylogeny...")
        # RAxML needs a minimal number of SNP columns; fall back to FastTree
        # when any aligned record is too short.
        mblocks_seqs = SeqIO.parse(os.path.join(outputDir, "parsnp.snps.mblocks"), "fasta")
        for seq in mblocks_seqs:
            if len(seq) < 6:
                logger.warning("Not enough SNPs to use RaxML. Attempting to use FastTree instead...")
                use_fasttree = True
                break
        if not use_fasttree:
            with TemporaryDirectory() as raxml_output_dir:
                command = "raxmlHPC-PTHREADS -m GTRCAT -p 12345 -T %d -s %s -w %s -n OUTPUT"%(threads,outputDir+os.sep+"parsnp.snps.mblocks", raxml_output_dir)
                run_command(command)
                # Keep only the best tree; the TemporaryDirectory discards the
                # other RAxML artifacts. (shutil.move replaces the former
                # unchecked os.system("mv ...").)
                shutil.move("{}/RAxML_bestTree.OUTPUT".format(raxml_output_dir), outputDir+os.sep+"parsnp.tree")

            mblocks_file = os.path.join(outputDir, "parsnp.snps.mblocks")

        if use_fasttree:
            if shutil.which("FastTreeMP") is not None:
                os.environ["OMP_NUM_THREADS"] = str(threads)
                command = "FastTreeMP -nt -quote -gamma -slow -boot 100 "+outputDir+os.sep+"parsnp.snps.mblocks > "+outputDir+os.sep+"parsnp.tree"
                run_command(command)
            else:
                # FastTreeMP simply isn't on PATH (the old message
                # misleadingly claimed it "failed").
                logger.info("FastTreeMP not found. Trying fasttree...")
                command = "fasttree -nt -quote -gamma -slow -boot 100 "+outputDir+os.sep+"parsnp.snps.mblocks > "+outputDir+os.sep+"parsnp.tree"
                run_command(command)


        #7)reroot to midpoint
        if os.path.exists("outtree"):
             os.remove("outtree")

        # reroot_tree is only True when dendropy imported successfully
        # (see module top).
        if reroot_tree and len(finalfiles) > 1:
            try:
                mtree = open("%sparsnp.tree"%(outputDir+os.sep), 'r')
                mtreedata = mtree.read()
                mtreedata = mtreedata.replace("\n","")
                tree = dendropy.Tree.get_from_string(mtreedata,"newick")
                tree.reroot_at_midpoint(update_bipartitions=False)
                # Drop dendropy's leading rooting token (e.g. "[&R] ")
                # before writing the newick string out.
                mftreef = tree.as_string('newick').split(" ",1)[1]
                #print mftreef
                mtreef = open(outputDir+os.sep+"parsnp.final.tree",'w')
                mtreef.write(mftreef)
                mtreef.close()
                os.system("mv %s %s"%(outputDir+os.sep+"parsnp.final.tree",outputDir+os.sep+"parsnp.tree"))
            except IOError:
                # NOTE(review): this reads args.use_fasttree while the
                # tree-building code above sets a local use_fasttree flag —
                # confirm the attribute exists on args.
                logger.error("Cannot process {} output, skipping midpoint reroot..\n".format("fasttree" if args.use_fasttree else "RaxML"))


        # use_gingr holds the (possibly empty) path to a Gingr install;
        # when set, fold the final tree into the .ggr archive.
        if len(use_gingr) > 0:
            logger.info("Creating Gingr input file..")
            if xtrafast or 1:
                # NOTE(review): "xtrafast or 1" is always true.
                #if newick available, add
                #new flag to update branch lengths
                run_command("harvesttools --midpoint-reroot -u -q -i "+outputDir+os.sep+"parsnp.ggr -o "+outputDir+os.sep+"parsnp.ggr -n %s"%(outputDir+os.sep+"parsnp.tree "))


    # Report total wall-clock time in the most readable unit.
    t2 = time.time()
    elapsed = t2 - t1
    if elapsed / 60.0 > 60:
        logger.info("Aligned %d genomes in %.2f hours"%(totseqs, elapsed / 3600.0))
    elif elapsed > 60:
        logger.info("Aligned %d genomes in %.2f minutes"%(totseqs, elapsed / 60.0))
    else:
        logger.info("Aligned %d genomes in %.2f seconds"%(totseqs, elapsed))

    # Clean up intermediate alignment files and per-block directories.
    for aln_file in glob(os.path.join(outputDir, "*.aln")):
        os.remove(aln_file)
    for block_subdir in glob(os.path.join(outputDir, "blocks/b*")):
        shutil.rmtree(block_subdir)

    # Verify the three expected outputs exist and are non-empty.
    filepres = 0
    logger.info("Parsnp finished! All output available in %s"%(outputDir))
    logger.debug("Validating output directory contents")
    # Tree check passes automatically when the phylogeny step was skipped
    # (operator precedence here is A or (B and C), which is the intent).
    if args.skip_phylogeny or os.path.exists("%sparsnp.tree"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.tree"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.tree:\t\tnewick format tree is missing!")
    if os.path.exists("%sparsnp.ggr"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.ggr"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.ggr:\t\tharvest input file for gingr (GUI) is missing!")
    if os.path.exists("%sparsnp.xmfa"%(outputDir+os.sep)) and os.path.getsize("%sparsnp.xmfa"%(outputDir+os.sep)) > 0:
        filepres+=1
    else:
        logger.error("parsnp.xmfa:\t\tXMFA formatted multi-alignment is missing")
    # if filepres != 3:
        # logger.critical("Output files missing, something went wrong. Check logs and relaunch or contact developers for assistance")

    # os.rmdir only succeeds when the blocks dir is already empty (its
    # contents were removed during the cleanup above).
    if os.path.exists("%sblocks"%(outputDir+os.sep)):
        os.rmdir("%sblocks"%(outputDir+os.sep))
    if os.path.exists("allmums.out"):
        os.remove("allmums.out")

    # Remove intermediate files unless verbose mode asked to keep them.
    if not VERBOSE and os.path.exists("parsnpAligner.ini"):
        os.remove("parsnpAligner.ini")

    prefix = os.path.join(outputDir, os.path.splitext(os.path.basename(ref))[0])
    if not VERBOSE and os.path.exists("%s.coords"%(prefix)):
        os.remove("%s.coords"%(prefix))

    if not VERBOSE and os.path.exists("%s.delta"%(prefix)):
        os.remove("%s.delta"%(prefix))

    for f in glob(os.path.join(outputDir,"*.reps")):
        if not VERBOSE and os.path.exists(f):
            os.remove(f)

    # for f in glob(os.path.join(outputDir, "*.ref")):
        # if not VERBOSE and os.path.exists(f):
            # os.remove(f)

    if not VERBOSE and os.path.exists("%s/psnn.ini"%(outputDir)):
        os.remove("%s/psnn.ini"%(outputDir))

    if not VERBOSE and os.path.exists("%s/all_mumi.ini"%(outputDir)):
        os.remove("%s/all_mumi.ini"%(outputDir))

    if not VERBOSE and os.path.exists("%s/tmp"%(outputDir)):
        shutil.rmtree("%s/tmp"%(outputDir))

    # if os.path.exists("%s/parsnp.snps.mblocks"%(outputDir)):
        # os.remove("%s/parsnp.snps.mblocks"%(outputDir))
    if os.path.exists("%s/parsnp.snps.mblocks.reduced"%(outputDir)):
        os.remove("%s/parsnp.snps.mblocks.reduced"%(outputDir))

    if not VERBOSE and os.path.exists("%s/all.mumi"%(outputDir)):
        os.remove("%s/all.mumi"%(outputDir))

    # Optionally launch the Gingr GUI on the result (macOS only).
    if os.path.exists(use_gingr):
        #check if available first
        rc = 0
        if binary_type == "osx":
            logger.info("Launching gingr..")
            os.system("open -n %s --args %s/parsnp.ggr"%(use_gingr,outputDir))

