#!/usr/bin/env python

#  Copyright (c) 2016-2019, Broad Institute, Inc. All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
#  * Neither the name Broad Institute, Inc. nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
#  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
#  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
#  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
#  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
#  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

import argparse
import os
import gzip
import bz2
from pathlib import Path
from Bio import SeqIO
#from strainge.io.utils import open_compressed



# Command-line interface: one or more input FASTA files plus an optional
# output directory (defaults to the current directory).
# NOTE: the arguments are parsed at module level, so this runs whenever the
# module is loaded, not only when it is executed as a script.
parser = argparse.ArgumentParser(description="Parse RefSeq fasta files to chromosome & plasmid fasta files")
parser.add_argument("--dir", "-d", default=".", help="output directory")
parser.add_argument("fasta", nargs='+', help="fasta file(s) to extract")
args = parser.parse_args()

def strainName(description):
    """Turn a RefSeq sequence description into a strain name.

    Applies the same clean-up rules that extract() uses when it derives an
    output file name:

    * everything from the first comma onwards is discarded;
    * well-known genus names are abbreviated and filler words such as
      "strain", "chromosome" or "complete" are removed;
    * punctuation is removed or turned into spaces, and anything from the
      first '=' onwards is cut off;
    * the first two space-separated words are dropped and the remaining
      words are joined with underscores.

    The last step assumes the description starts with a versioned accession
    such as ``NC_000913.3``: once '.' has been turned into a space, the
    accession and its version are exactly the first two words.

    :param description: full description line of a sequence record.
    :return: the derived strain name; an empty string when fewer than three
        words remain after clean-up.
    """
    # Order matters: e.g. "Escherichia coli" must be handled before the
    # generic "Escherichia " rule, and the dotted abbreviations (" str.",
    # " subsp.", " substr.") before '.' itself is turned into a space.
    replacements = (
        ("Escherichia coli", "E_coli"),
        ("Escherichia ", "E_"),
        ("Enterococcus", "En"),
        ("Shigella", "Sh"),
        ("Staphylococcus", "S"),
        ("Citrobacter", "Citro"),
        ("Streptococcus", "Strep"),
        ("Klebsiella", "K"),
        ("Pseudomonas", "Ps"),
        (" strain", ""),
        (" str.", ""),
        (" str ", " "),
        (" subsp.", ""),
        (" subsp ", " "),
        (" substrain", ""),
        (" substr.", ""),
        (" chromosome", ""),
        (" sequence", ""),
        (" complete", ""),
        (" genome", ""),
        (" assembly", ""),
        (" DNA", ""),
        ("'", ""),
        (":", "-"),
        ("/", " "),
        (".", " "),
        ("(", " "),
        (")", " "),
    )

    name = description.split(',')[0]
    for old, new in replacements:
        name = name.replace(old, new)

    # Drop "key=value" style trailers; a leading '=' is left untouched.
    equals_at = name.find('=')
    if equals_at > 0:
        name = name[:equals_at]

    words = name.strip().split(' ')
    return '_'.join(words[2:])

# Output names used so far. extract() checks this set before writing a record
# and appends the record's name to any output name that is already present.
names = set()

def open_compressed(filename):
    """Open a possibly compressed file for reading and return the handle.

    The opener is chosen from the file suffix: ``.gz`` uses gzip and ``.bz2``
    uses bz2, both in text mode ("rt"); any other suffix falls back to the
    built-in ``open`` with its default mode.

    :param filename: path of the file, as a string or a ``pathlib.Path``.
    :return: an open file object; closing it is the caller's responsibility.
    """
    path = filename if isinstance(filename, Path) else Path(filename)

    # Suffix -> opener for the supported compression formats.
    compressed_openers = {".gz": gzip.open, ".bz2": bz2.open}
    opener = compressed_openers.get(path.suffix)

    if opener is None:
        return open(path)
    return opener(path, "rt")

def open_fasta_file(file_name):
    """Iterate over the records of a FASTA file, transparently decompressing it.

    The file is opened with open_compressed() and parsed with
    ``SeqIO.parse(handle, "fasta")``. This is written as a generator so that
    the underlying file handle is owned by a ``with`` block: it is closed
    once all records have been consumed, or when the generator is closed or
    garbage collected, instead of being left open indefinitely.

    Because this is a generator, the file is opened when iteration starts
    rather than at call time.

    :param file_name: path of the FASTA file (plain, ``.gz`` or ``.bz2``).
    :return: an iterator over the parsed sequence records.
    """
    with open_compressed(file_name) as handle:
        yield from SeqIO.parse(handle, "fasta")



def extract(seqFile, outputDir):
    """Write every record of a FASTA file to its own FASTA file.

    For each record, an output name is derived from the record description:

    * everything from the first comma onwards is discarded;
    * well-known genus names are abbreviated and filler words such as
      "strain", "chromosome" or "complete" are removed;
    * punctuation is removed or turned into spaces, and anything from the
      first '=' onwards is cut off;
    * the first two space-separated words are dropped and the rest joined
      with underscores. This presumes the description starts with a
      versioned accession such as ``NC_000913.3``, whose '.' has become a
      space by then.

    If the derived name is empty, the record's own name is used instead, so
    no file called just ".fasta" is produced. If the name has already been
    used (tracked in the module-level ``names`` set), the record's name is
    appended to it.

    One line per record is printed: the directory of the input file, the
    record name and the output name.

    :param seqFile: path of the input FASTA file (plain, ``.gz`` or ``.bz2``).
    :param outputDir: directory in which ``<name>.fasta`` files are written.
    """
    # Order matters: e.g. "Escherichia coli" must be handled before the
    # generic "Escherichia " rule, and the dotted abbreviations (" str.",
    # " subsp.", " substr.") before '.' itself is turned into a space.
    replacements = (
        ("Escherichia coli", "E_coli"),
        ("Escherichia ", "E_"),
        ("Enterococcus", "En"),
        ("Shigella", "Sh"),
        ("Staphylococcus", "S"),
        ("Citrobacter", "Citro"),
        ("Streptococcus", "Strep"),
        ("Klebsiella", "K"),
        ("Pseudomonas", "Ps"),
        (" strain", ""),
        (" str.", ""),
        (" str ", " "),
        (" subsp.", ""),
        (" subsp ", " "),
        (" substrain", ""),
        (" substr.", ""),
        (" chromosome", ""),
        (" sequence", ""),
        (" complete", ""),
        (" genome", ""),
        (" assembly", ""),
        (" DNA", ""),
        ("'", ""),
        (":", "-"),
        ("/", " "),
        (".", " "),
        ("(", " "),
        (")", " "),
    )

    for seq in open_fasta_file(seqFile):
        # Keep only the part of the description before the first comma.
        name = seq.description.split(',')[0]
        for old, new in replacements:
            name = name.replace(old, new)

        # Drop "key=value" style trailers; a leading '=' is left untouched.
        equals_at = name.find('=')
        if equals_at > 0:
            name = name[:equals_at]

        words = name.strip().split(' ')
        name = '_'.join(words[2:])

        # Descriptions with fewer than three words leave nothing to build a
        # name from; fall back to the record name rather than writing a file
        # named ".fasta".
        if not name:
            name = seq.name

        # Disambiguate names that were already used for an earlier record.
        if name in names:
            name += "_" + seq.name

        print(os.path.dirname(seqFile), seq.name, name)
        path = os.path.join(outputDir, name + ".fasta")
        SeqIO.write(seq, path, "fasta")
        names.add(name)


# Process every FASTA file given on the command line, writing the per-record
# files into the requested output directory.
for fa in args.fasta:
    extract(fa, args.dir)




