function [strainLabels, snpData, snpPositions, totalSequenceLength] = ...
    readFastaData(fastaInputFile)
% READFASTADATA  Extract the SNP matrix from an aligned Fasta file.
%
% The function reads in data in Fasta format and creates the data matrix
% "snpData", which contains only the variable sites (SNPs).
% "snpPositions" specifies the genomic locations (1-based column indices
% in the alignment) of the SNPs. Missing values are not considered as
% different from other values, i.e. a position is NOT marked as a SNP if
% only a single base has been observed there, even if some strains have a
% missing value at that position.
%
% INPUT
%   fastaInputFile       Path to the input file. All sequences in the file
%                        must have the same length (i.e. be aligned).
%
% OUTPUT
%   strainLabels         nStrains x 1 cell array with the Fasta headers.
%   snpData              nStrains x nSnps numeric matrix. 'A', 'C', 'G'
%                        and 'T' (upper or lower case) are coded as 1, 2,
%                        3 and 4. Every other symbol ('-', '?', 'N',
%                        ambiguity codes, ...) is coded as -9 (missing).
%   snpPositions         1 x nSnps row vector of SNP column indices.
%   totalSequenceLength  Length of the sequences in the alignment.
%
% The file is read one entry at a time (twice), so that the full alignment
% never has to be held in memory. Requires FASTAREAD (Bioinformatics
% Toolbox).
%
% An error with identifier 'readFastaData:sequenceLengthMismatch' is
% raised if a sequence has a different length than the first one.
%
% TESTING EXAMPLE:
% [strainLabels, data, snpPositions, totalSequenceLength] = ...
%     readFastaData('..\data\testData\fastaTest.aln');
% isequal(strainLabels,{'Seq1';'Seq2';'Seq3';'Seq4'})
% isequal(data,[4 2 2 1;4 2 2 3;1 2 2 -9;4 3 3 3])
% isequal(snpPositions,[2 6 7 8])
% isequal(totalSequenceLength,8)

bases = 'ACGT';          % Position in this string is the numeric base code.
missingRow = 5;          % Row of observedValues used for "missing".
missingCode = -9;        % Value written to snpData for "missing".

% GO THROUGH ALL STRAINS ONCE, keep track of the variable sites:
counter = 1;
fastaData = fastaread(fastaInputFile,'Blockread',counter);
totalSequenceLength = length(fastaData.Sequence);

observedValues = false(5,totalSequenceLength);
% if observedValues(i,j)==true, then it means that at j:th position, i:th
% value has been observed, where i has five possible values:'A','C','G',
% 'T','miss'.

strainLabels = cell(5000,1);    % Initial guess; grows automatically if needed.
endOfFile = false;

disp('Scanning strains');
while ~endOfFile
    strainLabels{counter} = fastaData.Header;

    % The bookkeeping below indexes every sequence with the column indices
    % of the first one, so misaligned input must be rejected explicitly.
    if length(fastaData.Sequence) ~= totalSequenceLength
        error('readFastaData:sequenceLengthMismatch', ...
            ['Sequence %d (%s) has length %d, but the first sequence ' ...
            'has length %d. All sequences must have the same length.'], ...
            counter, fastaData.Header, length(fastaData.Sequence), ...
            totalSequenceLength);
    end

    % Convert sequence into numeric format: A, C, G, and T will be coded
    % by 1, 2, 3, and 4, respectively. Anything else is "missing".
    [isBase, sequence] = ismember(upper(fastaData.Sequence), bases);
    sequence(~isBase) = missingRow;

    observedValues(sub2ind([5,totalSequenceLength],sequence,...
        1:totalSequenceLength)) = true;

    counter = counter+1;
    try
        fastaData = fastaread(fastaInputFile,'Blockread',counter);
    catch
        % Requesting an entry past the last one makes fastaread fail; that
        % failure is used as the end-of-file signal. NOTE: any other read
        % error at this point also terminates the scan.
        endOfFile = true;
    end
end

nStrains = counter-1;
strainLabels = strainLabels(1:nStrains);

% GO THROUGH THE STRAINS AGAIN, save SNPs.
% A site is a SNP when at least two different bases (rows 1-4) were seen.
snpPositions = find(sum(observedValues(1:4,:))>1);
snpData = zeros(nStrains,length(snpPositions));

disp('Processing input data');
% The number of strains is known now, so read exactly that many entries
% and let any read error propagate instead of silently leaving rows empty.
for strainIndex = 1:nStrains
    fastaData = fastaread(fastaInputFile,'Blockread',strainIndex);

    % Same encoding rule as in the first pass, restricted to the SNP
    % columns, so that unrecognised symbols become -9 rather than 0.
    [isBase, sequence] = ismember(upper(fastaData.Sequence(snpPositions)), ...
        bases);
    sequence(~isBase) = missingCode;

    snpData(strainIndex,:) = sequence;
end