/*
 * Decompiled with CFR 0.152.
 */
package edu.msu.cme.rdp.readseq.utils.kmermatch;

import edu.msu.cme.rdp.readseq.SequenceType;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.utils.IUBUtilities;
import edu.msu.cme.rdp.readseq.utils.SeqUtils;
import edu.msu.cme.rdp.readseq.utils.kmermatch.KmerMatchCore;
import edu.msu.cme.rdp.readseq.utils.orientation.ProteinWordGenerator;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.TreeSet;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Ranks protein reference sequences against a query by the fraction of
 * amino-acid k-mers ("words") they share.
 *
 * <p>The score ("Sab") of a query against one reference is
 * {@code sharedWords / min(queryWordCount, referenceWordCount)}. Reference
 * words are computed once, at construction time, from the unaligned form of
 * each reference sequence. Query words are computed per call; nucleotide
 * queries are turned into words via
 * {@link ProteinWordGenerator#parseNuclAllFrames}.
 */
public class ProteinSeqMatch
extends KmerMatchCore {
    /** Word size used by {@link #ProteinSeqMatch(String)}. */
    private static final int DEFAULT_WORD_SIZE = 4;
    /**
     * If the best forward-strand score of a nucleotide query is below this
     * value, the reverse complement of the query is scored as well.
     */
    private static final float SabThreshold = 0.6f;

    /** Reference name -> set of protein words of that reference. */
    private final HashMap<String, HashSet<String>> refWordMap = new HashMap<String, HashSet<String>>();
    /** Reference name -> reference sequence. */
    private final HashMap<String, Sequence> refSeqMap = new HashMap<String, Sequence>();
    private final ProteinWordGenerator proteinWordGenerator;

    /**
     * Loads every sequence in {@code seqFile} as a reference.
     *
     * @param seqFile  path of the reference sequence file
     * @param wordSize word size handed to the {@link ProteinWordGenerator}
     * @throws IOException if the file cannot be opened, read or closed
     */
    public ProteinSeqMatch(String seqFile, int wordSize) throws IOException {
        this.proteinWordGenerator = new ProteinWordGenerator(wordSize);
        SequenceReader parser = new SequenceReader(new File(seqFile));
        try {
            Sequence seq;
            while ((seq = parser.readNextSequence()) != null) {
                this.addReference(seq);
            }
        } finally {
            // Release the reader even when reading or word generation fails.
            parser.close();
        }
    }

    /**
     * Loads every sequence in {@code seqFile} as a reference, using the
     * default word size of 4.
     *
     * @param seqFile path of the reference sequence file
     * @throws IOException if the file cannot be opened, read or closed
     */
    public ProteinSeqMatch(String seqFile) throws IOException {
        this(seqFile, DEFAULT_WORD_SIZE);
    }

    /**
     * Uses the given, already loaded sequences as references.
     *
     * @param refSeqs  reference sequences
     * @param wordSize word size handed to the {@link ProteinWordGenerator}
     */
    public ProteinSeqMatch(List<Sequence> refSeqs, int wordSize) {
        this.proteinWordGenerator = new ProteinWordGenerator(wordSize);
        for (Sequence seq : refSeqs) {
            this.addReference(seq);
        }
    }

    /**
     * Registers one reference under its sequence name; a later reference with
     * the same name replaces the earlier one in both maps.
     */
    private void addReference(Sequence seq) {
        String name = seq.getSeqName();
        String unaligned = SeqUtils.getUnalignedSeqString(seq.getSeqString());
        this.refSeqMap.put(name, seq);
        this.refWordMap.put(name, this.proteinWordGenerator.parseProtein(unaligned));
    }

    /**
     * Scores {@code seq} against every reference and returns at most
     * {@code k} matches.
     *
     * <p>The sequence type is guessed with
     * {@link SeqUtils#guessSequenceType}. For a nucleotide query the word
     * count used in the score denominator is the number of words over all
     * frames divided by three (presumably because only one frame is expected
     * to be the real one -- TODO confirm). If the best forward score of a
     * nucleotide query is below {@code SabThreshold}, the reverse complement
     * is scored too and its matches are added to the same result set.
     *
     * @param seq query sequence, protein or nucleotide
     * @param k   maximum number of matches to return; a value of zero or less
     *            yields an empty list
     * @return up to {@code k} matches in the iteration order of a
     *         {@link TreeSet} ordered by {@code KmerMatchCore.ResultComparator}
     *         (presumably best first -- confirm against that comparator).
     *         NOTE(review): a {@code TreeSet} drops elements its comparator
     *         reports as equal, so tied matches may be collapsed depending on
     *         how {@code ResultComparator} is written.
     */
    @Override
    public ArrayList<KmerMatchCore.BestMatch> findTopKMatch(Sequence seq, int k) {
        TreeSet<KmerMatchCore.BestMatch> orderedResultSet = new TreeSet<KmerMatchCore.BestMatch>(new KmerMatchCore.ResultComparator());
        boolean isNucleotide = SeqUtils.guessSequenceType(seq) == SequenceType.Nucleotide;

        HashSet<String> queryWordSet;
        float queryWordSize;
        if (isNucleotide) {
            queryWordSet = this.proteinWordGenerator.parseNuclAllFrames(seq.getSeqString());
            queryWordSize = (float)queryWordSet.size() / 3.0f;
        } else {
            queryWordSet = this.proteinWordGenerator.parseProtein(seq.getSeqString());
            queryWordSize = queryWordSet.size();
        }

        float bestForwardSab = this.scoreAgainstReferences(queryWordSet, queryWordSize, false, orderedResultSet);

        if (isNucleotide && bestForwardSab < SabThreshold) {
            HashSet<String> reverseWordSet = this.proteinWordGenerator.parseNuclAllFrames(IUBUtilities.reverseComplement(seq.getSeqString()));
            // NOTE(review): the forward-strand word count is reused as the
            // denominator basis for the reverse strand, as in the original
            // code; the reverse word set may have a different size.
            this.scoreAgainstReferences(reverseWordSet, queryWordSize, true, orderedResultSet);
        }

        ArrayList<KmerMatchCore.BestMatch> topkMatchList = new ArrayList<KmerMatchCore.BestMatch>();
        for (KmerMatchCore.BestMatch m : orderedResultSet) {
            if (topkMatchList.size() >= k) {
                // Nothing more can be added; stop instead of walking the rest.
                break;
            }
            topkMatchList.add(m);
        }
        return topkMatchList;
    }

    /**
     * Scores one query word set against every reference, adds one
     * {@code BestMatch} per reference to {@code results} and returns the
     * highest score seen (0 when there are no references).
     *
     * @param queryWordSet  words of the query
     * @param queryWordSize query word count to use in the score denominator
     * @param reversed      flag passed through to {@code BestMatch}; the
     *                      caller passes {@code true} for the
     *                      reverse-complement pass
     * @param results       set receiving the matches
     */
    private float scoreAgainstReferences(HashSet<String> queryWordSet, float queryWordSize, boolean reversed, TreeSet<KmerMatchCore.BestMatch> results) {
        float bestSab = 0.0f;
        for (Sequence target : this.refSeqMap.values()) {
            HashSet<String> targetWordSet = this.refWordMap.get(target.getSeqName());
            // Count the intersection directly rather than copying the query set.
            int shared = 0;
            for (String word : queryWordSet) {
                if (targetWordSet.contains(word)) {
                    ++shared;
                }
            }
            float minWordCount = Math.min(queryWordSize, (float)targetWordSet.size());
            // An empty query or reference word set would give 0/0 = NaN, which
            // must not reach the comparator-ordered result set; score it as 0.
            float sab = minWordCount > 0.0f ? (float)shared / minWordCount : 0.0f;
            if (sab > bestSab) {
                bestSab = sab;
            }
            results.add(new KmerMatchCore.BestMatch(target, sab, reversed));
        }
        return bestSab;
    }

    /**
     * Command-line entry point. For every query sequence, writes one
     * tab-separated line per match: query name, reference name, score and
     * reference description.
     *
     * @param args {@code protein_ref.fa query.fa outfile word_size knn}
     * @throws IOException if any of the files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        String usage = "Usage: protein_ref.fa query.fa outfile word_size knn\n  This program takes a nucleotide or protein query sequence file, returns the top k best matching protein reference sequences based on amino acid kmer matching\n  word_size 4 is recommended for the best performance. Range from 3 to 6 is recommended.\n  protein_ref.fa must be protein sequences\n  query.fa can be either protein sequences or nucleotide sequences";
        if (args.length != 5) {
            System.err.println(usage);
            System.exit(1);
        }
        // Parse the numeric arguments and load the references before the
        // output file is created, so bad input does not leave an empty,
        // unclosed output file behind.
        int wordSize = Integer.parseInt(args[3]);
        int k = Integer.parseInt(args[4]);
        ProteinSeqMatch theObj = new ProteinSeqMatch(args[0], wordSize);

        PrintStream out = new PrintStream(new File(args[2]));
        try {
            SequenceReader queryReader = new SequenceReader(new File(args[1]));
            try {
                Sequence seq;
                while ((seq = queryReader.readNextSequence()) != null) {
                    ArrayList<KmerMatchCore.BestMatch> results = theObj.findTopKMatch(seq, k);
                    for (KmerMatchCore.BestMatch m : results) {
                        out.println(seq.getSeqName() + "\t" + m.getBestMatch().getSeqName() + "\t" + m.getSab() + "\t" + m.getBestMatch().getDesc());
                    }
                }
            } finally {
                queryReader.close();
            }
        } finally {
            out.close();
        }
    }
}

