# -------------------------------------------------------------------------
#     This file is part of mMass - the spectrum analysis tool for MS.
#     Copyright (C) 2005-07 Martin Strohalm <mmass@biographics.cz>

#     This program is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.

#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.

#     Complete text of GNU GPL can be found in the file LICENSE in the
#     main directory of the program
# -------------------------------------------------------------------------

# Function: Load and parse sequence from FASTA format.

# load libs
import wx
import string
import re

# load modules
from dlg_select_sequence import dlgSelectSequence


class fastaDoc:
    """ Load and parse sequence data from FASTA format.

    The parsed result is kept in self.data, a dictionary with two keys:
    'title' (text of the header line without the leading '>') and
    'sequence' (list of one-letter amino acid codes).
    """

    # characters tolerated in sequence lines and silently removed before
    # validation: whitespaces, gap and stop marks, position numbers
    _fillerPattern = re.compile('[\t\n\r\f\v \\-*0-9]')

    # cleaned sequence line must consist of these amino acid codes only
    _aminoPattern = re.compile('^([ACDEFGHIKLMNPQRSTVWY]+)$')

    # ----
    def __init__(self, parent):
        """ Remember parent window (used as the owner of the selection
        dialog) and initialise empty document data. """

        self.parent = parent
        self.data = {
                    'title':'',
                    'sequence':[]
                    }
    # ----


    # ----
    def getSequence(self, path):
        """ Load and process data from FASTA sequence file.

        Returns self.data (dictionary with 'title' and 'sequence') on
        success, False if the file cannot be read or parsed, and None if
        the user cancels the selection dialog of a multi-sequence file.
        """

        # get sequence file; inner 'finally' closes the handle even if
        # reading fails, outer 'except' reports any I/O problem as False
        try:
            sequenceFile = open(path, 'rb')
            try:
                sequenceData = sequenceFile.readlines()
            finally:
                sequenceFile.close()
        except IOError:
            return False

        # parse sequence data
        sequences = self.parseSequence(sequenceData)
        if not sequences:
            return False

        # if one sequence only
        if len(sequences) == 1:
            seqId = 0

        # select one sequence if multi-sequence file
        else:
            dlg = dlgSelectSequence(self.parent, sequences)
            try:
                if dlg.ShowModal() != wx.ID_OK:
                    return None
                # read the selection before the dialog is destroyed
                seqId = dlg.selectedSequence
            finally:
                dlg.Destroy()

        # store selected sequence
        title, rawSequence = sequences[seqId]
        self.data['title'] = title
        self.data['sequence'] = list(rawSequence)

        return self.data
    # ----


    # ----
    def parseSequence(self, data):
        """ Parse given data and get sequences.

        data: iterable of lines (text, or raw bytes as read from a file
            opened in binary mode).

        Returns list of [title, sequence] pairs in the order of their
        appearance, or False if any sequence line contains a character
        that is not a standard amino acid code. Sequence lines found
        before the first header are returned under an empty title. Data
        with no header and no sequence yields one empty record.
        """

        sequences = []
        title = ''
        chunks = []

        # True once a header or some residues have been seen, i.e. there
        # is a pending record which must be stored at the next header
        started = False

        # check lines
        for line in data:

            # lines read in binary mode are bytes under Python 3
            if bytes is not str and isinstance(line, bytes):
                line = line.decode('utf-8', 'replace')
            line = line.strip()

            # empty line
            if not line:
                continue

            # FASTA format header
            if line[0] == '>':
                if started:
                    sequences.append([title, ''.join(chunks)])
                title = line[1:]
                chunks = []
                started = True
                continue

            # sequence body - remove fillers, all uppercase
            line = self._fillerPattern.sub('', line).upper()

            # nothing but fillers (e.g. lone '*' terminator or numbering)
            if not line:
                continue

            # check sequence
            if not self._aminoPattern.match(line):
                return False

            chunks.append(line)
            started = True

        # store last record
        sequences.append([title, ''.join(chunks)])

        return sequences
    # ----
