/*------------------------------------------------------
 Maximum likelihood estimation 
 of migration rate  and effectice population size
 using a Metropolis-Hastings Monte Carlo algorithm                            
 -------------------------------------------------------                        
 D A T A   R O U T I N E S 
 
 creates data structures,
 read data (Electrophoretic loci, sequences, microsats),
 feeds data into tree (?),
 prints data,
 destroys data.
 
 Peter Beerli 1996, Seattle
 beerli@csit.fsu.edu
 
Copyright 1996-2002 Peter Beerli and Joseph Felsenstein, Seattle WA
Copyright 2003-2004 Peter Beerli, Tallahassee FL
 
 This software is distributed free of charge for non-commercial use
 and is copyrighted. Of course, we do not guarantee that the software
 works and are not responsible for any damage you may cause or have.
 
 
$Id: data.c 228 2006-01-19 19:13:57Z beerli $
 
-------------------------------------------------------*/
/*! \file data.c

Data manipulation routines

*/

#include <string.h>

#include "migration.h"
#include "sighandler.h"
#include "tools.h"
#include "migrate_mpi.h"
#include "data.h"
extern long number_genomes (char type);

#ifdef DMALLOC_FUNC_CHECK
#include <dmalloc.h>
#endif
/* prototypes ----------------------------------------- */
void create_data (data_fmt ** data);
void get_data (FILE * infile, data_fmt * data, option_fmt * options);
void print_data (world_fmt * world, option_fmt * options, data_fmt * data);
long find_missing(data_fmt *data, long pop, long locus);
void print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
                         data_fmt * data);
void free_datapart (data_fmt * data, option_fmt * options, long locus);
/*private functions */
void init_data_structure1 (data_fmt ** data, option_fmt * options);
void read_header (FILE * infile, data_fmt * data, option_fmt * options);
void read_sites (data_fmt * data);
void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop);
void init_data_structure3 (data_fmt * data);
void read_popheader (FILE * infile, data_fmt * data, long pop, long genomes);
void read_indname (FILE * file, data_fmt * data, long pop, long ind,
                   long nmlength);
void read_popdata (FILE * file, data_fmt * data, long pop,
                   option_fmt * options);
void read_microalleles (FILE * infile, data_fmt * data, long pop, long ind);
void read_alleles (FILE * infile, data_fmt * data, long pop, long ind);
long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
                   long locus, long pop, long ind, long baseread);
void read_distance_fromfile (FILE * dfile, long tips, long nmlength,
                             MYREAL **m);
void finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
                      long pop, long baseread);
void print_alleledata (world_fmt * world, data_fmt * data,
                       option_fmt * options);
void print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data);

void print_header (FILE * outfile, long pop, world_fmt * world,
                   option_fmt * options, data_fmt * data);
void create_alleles (data_fmt * data);
void addAllele (data_fmt * data, char s[], long locus, long *z);
void set_numind (data_fmt * data);
void print_seq_pop (long locus, long pop, world_fmt * world,
                    option_fmt * options, data_fmt * data);
void print_seq_ind (long locus, long pop, long ind, world_fmt * world,
                    option_fmt * options, data_fmt * data);
void print_locus_head (long locus, world_fmt * world, option_fmt * options,
                       data_fmt * data);
void find_delimiter (char *title, char *dlm);
void read_geofile (data_fmt * data, option_fmt * options, long numpop);
void read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
                        long *uepsites, long datatype);
void read_uepfile (data_fmt * data, option_fmt * options, long numpop);


/*=====================================================*/
/// creates the data structure
void
create_data (data_fmt ** data)
{
    (*data) = (data_fmt *) mycalloc (1, sizeof (data_fmt));
}

/*
void
init_data (data_fmt * data)
{
 
}
*/

///
/// free the data module 
void
destroy_data (data_fmt * data)
{
  long ind;
  long indalloc;
  long locus;
  long pop;
  long loci = data->loci;
  long numpop = data->numpop;

  // free data from init_data_structure3
  for (locus = 0; locus < loci; locus++)
    {
      free(data->allele[locus]);
    }
  free(data->allele);
  free(data->maxalleles);
  free(data->skiploci);

  // free data from init_data_structure2
  for(pop=0; pop < numpop ; pop++)
    {
      indalloc = -1;
      for(locus=0; locus < loci; locus++)
	{
	  if(indalloc < data->numind[pop][locus])
	    indalloc = data->numind[pop][locus];
	}
      for (ind = 0; ind < indalloc; ind++)
	{
	  free(data->indnames[pop][ind]);
	  free(data->yy[pop][ind]);
	}
      free(data->indnames[pop]);
      free(data->yy[pop]);
    }
  free(data->indnames);
  free(data->yy);

  // data->yy were already freed in free_datapart()

  // free data from init_structure_1
  free (data->popnames[0]);
  free (data->numind[0]);
  free (data->numalleles[0]);
  free (data->popnames);
  free (data->numind);
  free (data->numalleles);
  free (data->seq->sites);
  free (data->seq);
  free (data->geo);
  free (data->lgeo);
  if(data->ogeo != NULL)
    {
      free (data->ogeo[0]);
      free (data->ogeo);
    }
  free (data);
}
 


void
get_data (FILE * infile, data_fmt * data, option_fmt * options)
{
    long pop;
    long genomes=1;
    data->hasghost = FALSE;
    read_header (infile, data, options);
    genomes =  number_genomes (options->datatype);
    init_data_structure1 (&data, options);
    switch (options->datatype)
    {
    case 's':
    case 'f':
        read_sites (data);
        break;
    case 'n':
        read_sites (data);
        data->seq->addon = 4;
        break;
    case 'u':
        read_sites (data);
        data->seq->addon = 4;
        break;
    default:
        data->seq->fracchange = 1.0;
        break;
    }
    if (options->progress)
        fprintf (stdout, "\n\n");
    if (options->writelog)
        fprintf (options->logfile, "\n\n");
    for (pop = 0; pop < data->numpop; pop++)
    {
        read_popheader (infile, data, pop, genomes);
        if (options->progress)
            fprintf (stdout, "Reading %s ...\n", data->popnames[pop]);
        if (options->writelog)
            fprintf (options->logfile, "Reading %s ...\n", data->popnames[pop]);
        init_data_structure2 (&data, options, pop);
        read_popdata (infile, data, pop, options);
    }
    read_geofile (data, options, data->numpop);
#ifdef UEP

    read_uepfile (data, options, data->numpop);
#endif

    if (options->progress)
        fprintf (stdout, "\n\n");
    init_data_structure3 (data);

    switch (options->datatype)
    {
    case 'a':
        create_alleles (data);
        break;
    case 'b':
        create_alleles (data);
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = XBROWN_SIZE;
        break;
    case 'm':
        create_alleles (data);
        for (pop = 0; pop < data->loci; pop++)
            data->maxalleles[pop] = options->micro_stepnum;
        break;
    }
}

/* private functions ========================================== */

void
init_data_structure1 (data_fmt ** data, option_fmt * options)
{
    long pop;
    long numpop = (*data)->numpop;
    long loci   = (*data)->loci;
    
    (*data)->ogeo = NULL;
    (*data)->geo = NULL;
    if ((*data)->yy == NULL)
    {
        (*data)->yy = (char *****) mymalloc (sizeof (char ****) * numpop);
        (*data)->seq = (seqmodel_fmt *) mycalloc (1, sizeof (seqmodel_fmt));
        (*data)->popnames =(char **) mymalloc (sizeof (char *) * numpop);
        (*data)->popnames[0] =(char *) mycalloc (numpop * LINESIZE,sizeof(char));
        (*data)->indnames = (char ***) mymalloc (sizeof (char **) * numpop);
        (*data)->numind = (long **) mymalloc (sizeof (long *) * numpop);
        (*data)->numind[0] = (long *) mymalloc (sizeof (long) * numpop * loci);
        (*data)->numalleles = (long **) mymalloc (sizeof (long *) * numpop);
        (*data)->numalleles[0] = (long *) mymalloc (sizeof (long) * numpop * loci);

        for (pop = 1; pop < numpop; pop++)
        {
	  (*data)->popnames[pop] = (*data)->popnames[0] + pop * LINESIZE;
	  (*data)->numind[pop] = (*data)->numind[0] + pop * loci;
	  (*data)->numalleles[pop] =  (*data)->numalleles[0] + pop * loci;
        }
        (*data)->seq->sites = (long *) mycalloc (1, sizeof (long) * loci);
    }
    else
    {
        error ("Problem with initialization of data matrix yy\n");
    }
}


void
init_data_structure2 (data_fmt ** data, option_fmt * options, long pop)
{
    long ind, locus;
    long indalloc = -1;
    for(locus=0;locus<(*data)->loci;locus++)
    {
        if(indalloc < (*data)->numind[pop][locus])
            indalloc = (*data)->numind[pop][locus];
    }
    if (indalloc == 0)
        indalloc = 2;
    (*data)->yy[pop] = (char ****) mymalloc (sizeof (char ***) * indalloc);
    (*data)->indnames[pop] = (char **) mycalloc (1, sizeof (char *) * indalloc);
    for (ind = 0; ind < indalloc; ind++)
    {
        (*data)->indnames[pop][ind] =
            (char *) mycalloc (1, sizeof (char) * (1 + options->nmlength));
        (*data)->yy[pop][ind] =
            (char ***) mymalloc (sizeof (char **) * (*data)->loci);
        for (locus = 0; locus < (*data)->loci; locus++)
        {
            if (!strchr (SEQUENCETYPES, options->datatype))
            {
                (*data)->yy[pop][ind][locus] =
                    (char **) mycalloc (1, sizeof (char *) * 2);
                (*data)->yy[pop][ind][locus][0] =
                    (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
                (*data)->yy[pop][ind][locus][1] =
                    (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
            }
            else
            {
                (*data)->yy[pop][ind][locus] =
                    (char **) mycalloc (1, sizeof (char *));
                (*data)->yy[pop][ind][locus][0] =
                    (char *) mycalloc (1,
                                     sizeof (char) * ((*data)->seq->sites[locus]+1));
            }
        }
    }
}


void
free_datapart (data_fmt * data, option_fmt * options, long locus)
{
    long ind, pop;
    //  long genomes = number_genomes (options->datatype);
    for (pop = 0; pop < data->numpop; pop++)
    {
        for (ind = 0; ind < data->numind[pop][locus]; ind++)
        {
            if (!strchr (SEQUENCETYPES, options->datatype))
            {
                free (data->yy[pop][ind][locus][0]);
                free (data->yy[pop][ind][locus][1]);
                free (data->yy[pop][ind][locus]);
            }
            else
            {
                free (data->yy[pop][ind][locus][0]);
                free (data->yy[pop][ind][locus]);
            }
        }
    }
}


void
init_data_structure3 (data_fmt * data)
{
    long locus, pop, maxi;
    data->allele =
        (allele_fmt **) mycalloc (1, sizeof (allele_fmt *) * data->loci);
    for (locus = 0; locus < data->loci; locus++)
    {
        maxi = 0;
        for (pop = 0; pop < data->numpop; pop++)
            maxi += data->numalleles[pop][locus];
        data->allele[locus] =
            (allele_fmt *) mycalloc (1, sizeof (allele_fmt) * maxi);
    }
    data->maxalleles = (long *) mycalloc (1, sizeof (long) * data->loci);
    data->skiploci =
        (boolean *) mycalloc (1, sizeof (boolean) * (data->loci + 1));
}

///
/// read the first line of the data file
/// \param infile datafilename
/// \param data   data structure that holds all the data
/// \param options structure that contain all option information
/// \reval none
void read_header (FILE * infile, data_fmt * data, option_fmt * options)
{
    char input[LINESIZE], *p;
    char title[LINESIZE];
    strcpy(title,"\0");
    FGETS (input, sizeof (input), infile);
    if ((p = (char *) strpbrk (input, CRLF)) != NULL)
        *p = '\0';
    switch (lowercase (input[0]))
    {
    case 'a':
        sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
                &(data->loci), title);
        find_delimiter (title, &data->dlm);
	if(!(title[0]==' ' || title[0] == '\0'))
	  strcpy(options->title,title);
        break;
    case 'b':
    case 'm':
        sscanf (input, "%1s%ld%ld%1s%[^\n]", &options->datatype,
                &(data->numpop), &(data->loci), &data->dlm, title);
	if(!(title[0]==' ' || title[0] == '\0'))
	  strcpy(options->title,title);
        break;
    case 's':
    case 'n':
    case 'u':
    case 'f':
        sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
                &(data->loci), title);
	if(!(title[0]==' ' || title[0] == '\0'))
	  strcpy(options->title,title);
        break;
    case 'g':   /* fall through if a menu change forces to analyze data
                               instead of using the already sampled genealogies */
        if (options->datatype == 'g')
            break;
        else
            memmove (input, input + 1, (strlen (input) - 1) * sizeof (char));
    default:
        switch (options->datatype)
        {
        case 'a':
            sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
                    title);
            find_delimiter (title, &data->dlm);
	if(!(title[0]==' ' || title[0] == '\0'))
	  strcpy(options->title,title);
            break;
        case 'b':
        case 'm':
            sscanf (input, "%ld%ld%1s%[^\n]", &(data->numpop), &(data->loci),
                    &(data->dlm), title);
	    if(!(title[0]==' ' || title[0] == '\0'))
	      strcpy(options->title,title);
            break;
        case 's':
        case 'n':
        case 'u':
        case 'f':
            sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
                    title);
	    if(!(title[0]==' ' || title[0] == '\0'))
	      strcpy(options->title,title);
            break;
        default:
            usererror ("Datatype is wrong, please use a valid data type!");
        }
    }
    options->datatype = lowercase (options->datatype);
}

void
find_delimiter (char *title, char *dlm)
{
    char *p = title;
    long z = 0;
    while (*p == ' ')
    {
        p++;
        z++;
    }
    if (isalnum (*p))
        memmove (title, p, sizeof (char) * (strlen (title) - z));
    else
    {
        *dlm = *p;
        p++;
        while (*p == ' ')
        {
            p++;
            z++;
        }
        memmove (title, p, sizeof (char) * (strlen (title) - z));
    }
}


/*  old sites reader, will be obsolete once the new is working correctly
void
read_sites (data_fmt * data)
{
    long locus;
    char *input, *p, *a;
    input = (char *) mycalloc (LINESIZE, sizeof (char));
    FGETS (input, LINESIZE, data->infile);
    if ((p = (char *) strpbrk (input, CRLF)) != NULL)
        *p = '\0';
    p = input;
    for (locus = 0; locus < data->loci; locus++)
    {
        while (isspace ((int) *p))
            p++;
        if (locus == 0)
            a = strtok (p, " ");
        else
            a = strtok (NULL, " ");
        data->seq->sites[locus] = atoi (a);
        if (data->seq->sites[locus] == 0)
        {
            warning ("This does look like sequence data\n");
            warning ("I just read a number of sites=0\n");
            warning ("If you use the wrong data type, the program\n");
            usererror ("will crash anyway, so I stop now\n");
        }
    }
    free (input);

}
*/

//==================================================================
// read the number of sites for each locus in the dataset
// does not assume a fixed line length, but assumes that at the end of the line is either
// a \n or \r or \r\l (similar to the sequence reader) to accommodate windows, mac and
// unix line ends
void
read_sites (data_fmt * data)
{
    long locus;
    char *input;
    input = (char *) mycalloc(LINESIZE,sizeof(char));
    
    for (locus = 0; locus < data->loci-1; locus++)
    {
        read_word(data->infile, input);
        data->seq->sites[locus] = atoi (input);
        if (data->seq->sites[locus] == 0)
        {
            warning ("This does look like sequence data\n");
            warning ("I just read a number of sites=0\n");
            warning ("If you use the wrong data type, the program\n");
            usererror ("will crash anyway, so I stop now\n");
        }
    }
        FGETS(input,LINESIZE,data->infile);
        data->seq->sites[locus] = atoi (input);

    free (input);

}

/*
void
read_popheader (FILE * infile, data_fmt * data, long pop, long genomes)
{
    char input[SUPERLINESIZE], *p, *tmp;
    long locus,lo;
    boolean havepopname=FALSE;
    FGETS (input, sizeof (input), infile);
    if ((p = (char *) strpbrk (input, CRLF)) != NULL)
        *p = '\0';

    // allows that sequence data can have different numbers of individuals for different loci
    // data syntax changes: #ind1 #ind2 #IND3 .... pop_name
    havepopname=FALSE;
    if(data->loci>1)
    {
        tmp = strtok (input, " ");
        data->numind[pop][0] = atol(tmp);
        data->numalleles[pop][0] = data->numind[pop][0] * genomes;
        for(locus=1; locus< data->loci; locus++)
        {
            tmp = strtok(NULL," ");
            if(tmp!=NULL)
            {
                if(isdigit(tmp[0]))
                {
                    data->numind[pop][locus] = atol(tmp);
                    data->numalleles[pop][locus] = data->numind[pop][locus]*genomes;
                }
                else
                {
                    havepopname=TRUE;
                    strncpy(data->popnames[pop],tmp,MIN(strlen(tmp),80));
                    break;
                }
            }
        }
        if(!havepopname)
        {
            tmp = strtok(NULL," ");
            if(tmp!=NULL)
                strncpy(data->popnames[pop],tmp,MIN(strlen(tmp),80));
        }

        // fills numind for additional locus in case the numind was not specified
        for(lo=locus; lo< data->loci; lo++)
        {
            data->numind[pop][lo] = data->numind[pop][locus-1];
            data->numalleles[pop][lo] = data->numind[pop][lo] * genomes;
        }
	}
    else
    {
        // only one locus so we can use old scheme [see below]
        sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]),
                data->popnames[pop]);
        data->numalleles[pop][0] = data->numind[pop][0]*genomes;
    }

    translate (data->popnames[pop], ' ', '_');
    translate (data->popnames[pop], '\t', '_');
    //trial 
    //  for(locus=0;locus<data->loci; locus++)
    //    {
    //      if (data->numind[pop][locus] == 0)
    //        data->hasghost = FALSE ;//TRUE;
    //    }
}
*/

void
read_popheader (FILE * infile, data_fmt * data, long pop, long genomes)
{
    long lo;
    boolean havepopname=FALSE;
    long locus;
    char *input;
    input = (char *) mycalloc(LINESIZE,sizeof(char));

    // allows that sequence data can have different numbers of individuals for different loci
    // data syntax changes: #ind1 #ind2 #IND3 .... pop_name
    havepopname=FALSE;
    if(data->loci>1)
    {
        read_word(data->infile, input);
        data->numind[pop][0] = atol(input);
        data->numalleles[pop][0] = data->numind[pop][0] * genomes;
        for(locus=1; locus< data->loci; locus++)
        {
            read_word(infile, input);
            if(isdigit(input[0]))
            {
                data->numind[pop][locus] = atol(input);
                data->numalleles[pop][locus] = data->numind[pop][locus]*genomes;
            }
            else
            {
                unread_word(infile, input);
                FGETS(input,LINESIZE,infile);
                havepopname=TRUE;
                strncpy(data->popnames[pop],input,MIN(strlen(input),80));
                break;
            }
        }
        if(!havepopname)
        {
            read_word(infile, input);
            strncpy(data->popnames[pop],input,MIN(strlen(input),80));
        }

        // fills numind for additional locus in case the numind was not specified
        for(lo=locus; lo< data->loci; lo++)
        {
            data->numind[pop][lo] = data->numind[pop][locus-1];
            data->numalleles[pop][lo] = data->numind[pop][lo] * genomes;
        }
    }
    else
    {
        // only one locus so we can use old scheme [see below]
        FGETS(input,LINESIZE,infile);
        sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]),
                data->popnames[pop]);
        data->numalleles[pop][0] = data->numind[pop][0]*genomes;
    }

    translate (data->popnames[pop], ' ', '_');
    translate (data->popnames[pop], '\t', '_');
    free(input);
}


void
read_indname (FILE * file, data_fmt * data, long pop, long ind, long nmlength)
{
    long i = 0;
    char ch;
    while (i < nmlength)
    {
        ch = getc (file);
        if(!strchr("\r\n",ch))
            data->indnames[pop][ind][i++] = ch;
        if(strchr("\t",ch))
            break;
    }
    data->indnames[pop][ind][nmlength] = '\0';
}

void
read_popdata (FILE * infile, data_fmt * data, long pop, option_fmt * options)
{
    long ind, baseread = 0;
    long locus = 0;
    for (ind = 0; ind < data->numind[pop][0]; ind++)
    {
        read_indname (infile, data, pop, ind, options->nmlength);
        switch (options->datatype)
        {
        case 'a':
        case 'b':
        case 'm':
            if (data->dlm == '\0')
                read_alleles (infile, data, pop, ind);
            else
                read_microalleles (infile, data, pop, ind);
            break;
        case 's':
        case 'n':
        case 'u':
        case 'f':
            baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
            break;
        default:
            usererror
            ("Wrong datatype, only the types a, m, s, n\n       (electrophoretic alleles, \n       microsatellite data,\n       sequence data,\n       SNP polymorphism)\n        are allowed.\n");
            break;
        }
    }
    if (!strchr (SEQUENCETYPES, options->datatype))
        return;
    else
    {
        finish_read_seq (infile, data, options, pop, baseread);
    }
}

void
read_microalleles (FILE * infile, data_fmt * data, long pop, long ind)
{
    char *input, *isave, dlm[2], ddlm[2], *p, *a, *a1, *a2;
    long locus, i;
    input = (char *) mycalloc (1, sizeof (char) * (SUPERLINESIZE + 1));
    isave = input;
    a = (char *) mycalloc (1, sizeof (char) * LINESIZE);
    a1 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
    a2 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
    dlm[0] = data->dlm, dlm[1] = '\0';
    ddlm[0] = ' ', ddlm[1] = '\0';
    FGETS (input, SUPERLINESIZE, infile);
    if ((p = (char *) strpbrk (input, CRLF)) != NULL)
        *p = '\0';
    for (locus = 0; locus < data->loci; locus++)
    {
        while (isspace ((int) *input))
            input++;
        if (input[0] == '\0')
            FGETS (input, SUPERLINESIZE, infile);
        i = 0;
        while (strchr(" \t",input[i])==NULL && input[i] != dlm[0])
        {
            a1[i] = input[i];
            i++;
        }
        a1[i] = '\0';
        input += i;
        i = 0;
        if (input[i] == dlm[0])
        {
            input++;
            while (strchr(" \t",input[i])==NULL && input[i] != '\0')
            {
                a2[i] = input[i];
                i++;
            }
            a2[i] = '\0';
            if (a2[0] == '\0')
            {
                strcpy (a2, a1);
            }
            input += i;
        }
        else
        {
            strcpy (a2, a1);
        }
        strcpy (data->yy[pop][ind][locus][0], a1);
        strcpy (data->yy[pop][ind][locus][1], a2);
    }
    free (a);
    free (a1);
    free (a2);
    free (isave);
}

void
read_alleles (FILE * infile, data_fmt * data, long pop, long ind)
{
    char *input, *isave, *p, *a;
    long locus;
    a = (char *) mycalloc (1, sizeof (char) * LINESIZE);

    input = (char *) mycalloc (1, sizeof (char) * SUPERLINESIZE);
    isave = input;
    FGETS (input, SUPERLINESIZE, infile);
    if ((p = (char *) strpbrk (input, CRLF)) != NULL)
        *p = '\0';
    for (locus = 0; locus < data->loci; locus++)
    {
        while (isspace ((int) *input))
        {
            input++;
        }
        if (sscanf (input, "%s", a) == 1)
        {
            input += (long) strlen (a);
        }

        data->yy[pop][ind][locus][0][0] = a[0];
        data->yy[pop][ind][locus][0][1] = '\0';
        if (a[1] == '\0')
        {
            data->yy[pop][ind][locus][1][0] = a[0];
            data->yy[pop][ind][locus][1][1] = '\0';
        }
        else
        {
            data->yy[pop][ind][locus][1][0] = a[1];
            data->yy[pop][ind][locus][1][1] = '\0';
        }
    }
    free (a);
    free (isave);
}

long
read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
              long locus, long pop, long ind, long baseread)
{
    long j;
    char charstate;
    j = (options->interleaved) ? baseread : 0;
    charstate = getc (infile);
    ungetc ((int) charstate, infile);
    while (j < data->seq->sites[locus]
            && !(options->interleaved && strchr(CRLF,charstate)))
    {
        charstate = getc (infile);
        if (strchr(CRLF,charstate))
        {
            if (options->interleaved)
            {
                while(strchr(CRLF,charstate=getc(infile)))
                    ;
                ungetc ((int) charstate, infile);
                return j;
            }
            else
                charstate = ' ';
        }
        if (charstate == ' '
                || (charstate >= '0' && charstate <= '9') || charstate == '\\')
            continue;
        charstate = uppercase (charstate);
        //      printf("%c",charstate);
        if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
        {
            printf
            ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
             charstate, j, ind, pop);
            printf
            ("Last complete sequences was in population %li, individual %li and locus %li:\n%s",
             pop + 1, ind - 1, locus, data->indnames[pop][ind - 1]);
            for (j = 0; j < data->seq->sites[locus]; j++)
                printf ("%c", data->yy[pop][ind - 1][locus][0][j]);
            exit (EXIT_FAILURE);
        }
        data->yy[pop][ind][locus][0][j++] = charstate;
    }
    charstate = getc (infile); /* swallow the \n or \r */
#ifndef MAC

    if (charstate == '\r')
    {
        charstate = getc (infile); /* swallow the \n */
        if(charstate!='\n')
            ungetc((int) charstate, infile);
    }
#endif
    return j;
}

void
read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m)
{
    char input[SUPERLINESIZE];
    long i, j;

    if (dfile != NULL)
    {
        // assumes that the dfile is in PHYLIP format
        // and that all n x n cells are filled.
        FGETS (input, LONGLINESIZE, dfile); //reads header line with
        for (i = 0; i < tips; i++) // of individuals
        {
            //reads header line with
            FGETS (input, nmlength + 1, dfile);
            for (j = 0; j < tips; j++)
            {
#ifdef USE_MYREAL_FLOAT
                fscanf (dfile, "%f", &m[i][j]);
#else
                fscanf (dfile, "%lf", &m[i][j]);
#endif
            }
            // reads the last \n from the
            // data matrix
            FGETS (input, LONGLINESIZE, dfile);
        }
    }
}

#ifdef UEP
// uep function

void
read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
                   long *uepsites, long datatype)
{
    char input[LINESIZE];
    long i, j;
    long thistips;
    if (uepfile != NULL)
    {
        // assumes that the uepfile is in PHYLIP format
        // and that all n cells are filled.
        FGETS (input, LINESIZE, uepfile); //reads header line with
        // of individuals and uep sites
        sscanf (input, "%li%li", &thistips, uepsites);
        if (thistips != tips)
            error ("UEP datafile and infile are inconsistent!");
        if (strchr (SEQUENCETYPES, datatype))
        {
            for (i = 0; i < tips; i++)
            {
                uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
                FGETS (input, nmlength + 1, uepfile); //reads each line
                for (j = 0; j < *uepsites; ++j)
                    fscanf (uepfile, "%i", &uep[i][j]);
                // reads the last \n from the data matrix
                FGETS (input, LINESIZE, uepfile);
            }
        }
        else
        {
            for (i = 0; i < tips; i++)
            {
                uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
                uep[i + tips] = (int *) mycalloc (*uepsites, sizeof (int));
                FGETS (input, nmlength + 1, uepfile); //reads each line
                for (j = 0; j < *uepsites; ++j)
                    fscanf (uepfile, "%i", &uep[i][j]);
                // finished reading first allele, no onto the second
                for (j = 0; j < *uepsites; ++j)
                    fscanf (uepfile, "%i", &uep[i + tips][j]);
                // reads the last \n from the data matrix
                FGETS (input, LINESIZE, uepfile);
            }
        }
    }
}
#endif

void
finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
                 long pop, long baseread)
{

    long ind, baseread2 = 0, locus = 0;
    if (options->interleaved)
    {
        while (baseread < data->seq->sites[0])
        {
            for (ind = 0; ind < data->numind[pop][0]; ind++)
            {
                baseread2 =
                    read_ind_seq (infile, data, options, locus, pop, ind,
                                  baseread);
            }
            baseread = baseread2;
        }
    }
    for (locus = 1; locus < data->loci; locus++)
    {
        baseread = 0;
        for (ind = 0; ind < data->numind[pop][locus]; ind++)
        {
            read_indname (infile, data, pop, ind, options->nmlength);
            baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
        }
        if (options->interleaved)
        {
            while (baseread < data->seq->sites[locus])
            {
                for (ind = 0; ind < data->numind[pop][locus]; ind++)
                {
                    baseread2 =
                        read_ind_seq (infile, data, options, locus, pop, ind,
                                      baseread);
                }
                baseread = baseread2;
            }
        }
    }
}

long find_missing(data_fmt *data, long pop, long locus)
{
    long missing = 0;
    long ind;
    for(ind=0; ind < data->numind[pop][locus]; ind++)
    {
        if(data->yy[pop][ind][locus][0][0]=='?')
            missing++;
        if(data->yy[pop][ind][locus][1][0]=='?')
            missing++;
    }
    return missing;
}

void
print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
                    data_fmt * data)
{
    long locus;
    long pop;
    long numind;
    long nummiss;
    char dstring[LINESIZE];
    long *total;
    long *totalmiss;
    total = (long *) mycalloc(data->loci,sizeof(long));
    totalmiss = (long *) mycalloc(data->loci,sizeof(long));
    fprintf (file, "Summary of data:\n");
    fprintf (file, "---------------\n");
    switch (options->datatype)
    {
    case 'a':
        strcpy (dstring, "Allelic data");
        break;
    case 'f':
    case 's':
        strcpy (dstring, "Sequence data");
        break;
    case 'b':
    case 'm':
        strcpy (dstring, "Microsatellite data");
        break;
    case 'n':
    case 'u':
        strcpy (dstring, "SNP data");
        break;
    default:
        strcpy (dstring, "Unknown data [ERROR]");
        break;
    }
    fprintf (file, "Datatype:                                %20s\n", dstring);
    fprintf (file, "Number of loci:                          %20li\n\n",
             data->loci);

    if (!strchr (SEQUENCETYPES, options->datatype))
      {
        fprintf (file,
                 "Population                                    Locus  Gene copies\n");
        fprintf (file,
             "                                                     data (missing)\n");
    }
    else
        fprintf (file,
             "Population                                    Locus  Gene copies\n");
    fprintf (file,
             "----------------------------------------------------------------\n");
    for (pop = 0; pop < world->numpop; pop++)
    {
        if (!strchr (SEQUENCETYPES, options->datatype))
          {
            nummiss = find_missing(data,pop,0);
            numind = data->numalleles[pop][0] - nummiss;
            fprintf (file, "%3li %-40.40s %5li %6li (%li)\n", pop+1, data->popnames[pop], 1L , numind, nummiss);
          }
        else
          {
            nummiss = 0;
            numind = data->numind[pop][0];
            fprintf (file, "%3li %-40.40s %5li %6li\n", pop+1, data->popnames[pop], 1L , numind);
          }
        total[0] += numind;
        totalmiss[0] += nummiss;
        
        for(locus=1; locus< data->loci; locus++)
        {
            if (!strchr (SEQUENCETYPES, options->datatype))
              {
                nummiss = find_missing(data,pop,locus);
                numind = data->numalleles[pop][locus] - nummiss;
                fprintf (file,"                                             %5li %6li (%li)\n",  locus+1, numind, nummiss);
              }
            else
              {
                nummiss=0;
                numind = data->numind[pop][locus];
                fprintf (file,"                                             %5li %6li\n",  locus+1, numind);
              }
            total[locus] += numind;
            totalmiss[locus] += nummiss;
        }
    }
    if (!strchr (SEQUENCETYPES, options->datatype))
      {
        fprintf (file,"Total of all populations                     %5li %6li (%li)\n",1L, total[0], totalmiss[0]);
        for(locus=1; locus< data->loci; locus++)
          {
            fprintf (file,"                                             %5li %6li (%li)\n",locus+1, total[locus], totalmiss[locus]);
          }
      }
    else
      {
        fprintf (file,"Total of all populations                     %5li %6li\n",1L, total[0]);
        for(locus=1; locus< data->loci; locus++)
          {
            fprintf (file,"                                             %5li %6li\n",locus+1, total[locus]);
          }
      }    
    fprintf(file,"\n");
    free(total);
    free(totalmiss);
    fflush (file);
}

void
print_data (world_fmt * world, option_fmt * options, data_fmt * data)
{
    if (options->printdata)
    {
        switch (options->datatype)
        {
        case 'a':
        case 'b':
        case 'm':
            print_alleledata (world, data, options);
            break;
        case 's':
        case 'n':
        case 'u':
        case 'f':
            print_seqdata (world, options, data);
            break;
        }
    }
}

void
print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options)
{
    long i, pop, ind, locus, mult80;
    for (pop = 0; pop < data->numpop; pop++)
    {
        print_header (world->outfile, pop, world, options, data);
        for (ind = 0; ind < data->numind[pop][0]; ind++)
        {
            fprintf (world->outfile, "%-*.*s ", (int) options->nmlength,
                     (int) options->nmlength, data->indnames[pop][ind]);
            mult80 = options->nmlength;
            for (locus = 0; locus < data->loci; locus++)
            {
                mult80 +=
                    1 + (long) (strlen (data->yy[pop][ind][locus][0]) +
                    strlen (data->yy[pop][ind][locus][1]));
                if (mult80 >= 80)
                {
                    mult80 = 0;
                    fprintf (world->outfile, "\n");
                    for (i = 0; i < options->nmlength; i++)
                        FPRINTF(world->outfile, " ");
                }
                fprintf (world->outfile, " %s.%-s",
                         data->yy[pop][ind][locus][0],
                         data->yy[pop][ind][locus][1]);
            }
            fprintf (world->outfile, "\n");
        }
        fprintf (world->outfile, "\n");
    }
    fprintf (world->outfile, "\n\n");
    fflush (world->outfile);
}

void
print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data)
{
    long pop, locus;
    for (pop = 0; pop < data->numpop; pop++)
    {
        print_header (world->outfile, pop, world, options, data);
        for (locus = 0; locus < data->loci; locus++)
        {
            print_locus_head (locus, world, options, data);
            print_seq_pop (locus, pop, world, options, data);
        }
    }
    fflush (world->outfile);
}

void
print_header (FILE * outfile, long pop, world_fmt * world,
              option_fmt * options, data_fmt * data)
{
    long i;
    long locus, mult80 = 80;
    char input[LINESIZE];
    fprintf (outfile, "\n%-s", data->popnames[pop]);
    for (i = 0; i < (long) (80 - (long) strlen (data->popnames[pop])); i++)
         fprintf(world->outfile, "-");
    fprintf (outfile, "\n\n");
    if (!strchr (SEQUENCETYPES, options->datatype))
    {
        fprintf (outfile, "%-s  ", (data->loci == 1 ? "locus" : "loci "));
        for (i = 0; i < (options->nmlength - 6); i++)
            fprintf(world->outfile, " ");
        for (locus = 0; locus < data->loci; locus++)
        {
            if (locus * 4 + options->nmlength > mult80)
            {
                mult80 += 80;
                fprintf (outfile, "\n");
                for (i = 0; i < options->nmlength; i++)
                    fprintf (outfile, " ");
            }
            fprintf (outfile, "  %2li", locus + 1);
        }
        fprintf (outfile, "\n%-s\n",
                 strncpy (input, "indiv.", options->nmlength));
    }
}



void
create_alleles (data_fmt * data)
{
    long locus, pop, ind;
    long z;
    char a1[DEFAULT_ALLELENMLENGTH];
    char a2[DEFAULT_ALLELENMLENGTH];
    for (locus = 0; locus < data->loci; locus++)
    {
        z = 0;
        for (pop = 0; pop < data->numpop; pop++)
        {
            for (ind = 0; ind < data->numind[pop][locus]; ind++)
            {
                strcpy (a1, data->yy[pop][ind][locus][0]);
                strcpy (a2, data->yy[pop][ind][locus][1]);
                if (!strcmp (a1, a2))
                {
                    addAllele (data, a1, locus, &z);
                }
                else
                {
                    addAllele (data, a1, locus, &z);
                    addAllele (data, a2, locus, &z);
                }
            }
        }
        data->maxalleles[locus] = z + 1;
        /* + 1: for all the unencountered alleles */
    }
}

void
addAllele (data_fmt * data, char s[], long locus, long *z)
{
    long found = 0;
    while ((data->allele[locus][found++][0] != '\0')
            && (strcmp (s, data->allele[locus][found - 1])))
        ;
    if (found > (*z))
    {
        strcpy (data->allele[locus][*z], s);
        (*z)++;
    }
}

void
set_numind (data_fmt * data)
{
    long locus, pop;
    for (locus = 1; locus < data->loci; locus++)
    {
        for (pop = 0; pop < data->numpop; pop++)
        {
            data->numind[pop][locus] = data->numind[pop][0];
            data->numalleles[pop][locus] = data->numalleles[pop][0];
        }
    }
}


void
print_seq_pop (long locus, long pop, world_fmt * world, option_fmt * options,
               data_fmt * data)
{
    long ind;
    for (ind = 0; ind < data->numalleles[pop][locus]; ind++)
    {
        print_seq_ind (locus, pop, ind, world, options, data);
    }
}

void
print_seq_ind (long locus, long pop, long ind, world_fmt * world,
               option_fmt * options, data_fmt * data)
{
    long site;
    char blank[2] = " ";
    fprintf (world->outfile, "%-*.*s", (int) options->nmlength,
             (int) options->nmlength, data->indnames[pop][ind]);
    fprintf (world->outfile, " %c", data->yy[pop][ind][locus][0][0]);
    for (site = 1; site < data->seq->sites[locus]; site++)
    {
        if ((site) % 60 == 0)
        {
            fprintf (world->outfile, "\n%-*.*s %c", (int) options->nmlength,
                     (int) options->nmlength, blank,
                     data->yy[pop][ind][locus][0][site]);
        }
        else
        {
            if ((site) % 10 == 0)
            {
                fprintf (world->outfile, " ");
            }
            fprintf (world->outfile, "%c", data->yy[pop][ind][locus][0][site]);
        }
    }
    fprintf (world->outfile, "\n");
}


void
print_locus_head (long locus, world_fmt * world, option_fmt * options,
                  data_fmt * data)
{
    char *head;
    head = (char *) mycalloc (1, sizeof (char) * MAX (10, options->nmlength));
    sprintf (head, "Locus %li", locus);
    fprintf (world->outfile, "%-*.*s --------10 --------20 --------30",
             (int) options->nmlength, (int) options->nmlength, head);
    fprintf (world->outfile, " --------40 --------50 --------60\n");

    free (head);
}

void
read_geofile (data_fmt * data, option_fmt * options, long numpop)
{
    long i, j, pop;
    long numpop2 = numpop * numpop;
    data->geo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
    data->lgeo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
    if (!options->geo)
    {
        for (i = 0; i < numpop2; i++)
            data->geo[i] = 1.0;
    }
    else
    {
        data->ogeo = (MYREAL **) mycalloc (1, sizeof (MYREAL *) * numpop);
        data->ogeo[0] = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
        for (pop = 1; pop < numpop; pop++)
            data->ogeo[pop] = data->ogeo[0] + numpop * pop;
        read_distance_fromfile (data->geofile, numpop, options->nmlength,
                                data->ogeo);
        for (i = 0; i < numpop; i++)
        {
            for (j = 0; j < numpop; j++)
            {
                if(i!=j)
                {
                    data->geo[mm2m (i, j, numpop)] =   1. / data->ogeo[i][j];
                    data->lgeo[mm2m (i, j, numpop)] =  data->ogeo[i][j] > 0.0 ?
                                                       log (1. / data->ogeo[i][j]) : -DBL_MAX;
                }
            }
        }
    }
}

#ifdef UEP
void
read_uepfile (data_fmt * data, option_fmt * options, long numpop)
{
    long i;
    long sumtips = 0;

    if (!options->uep)
        return;

    for (i = 0; i < numpop; ++i)
        sumtips += data->numind[i][0];   //Assumes that UEP has the same number of individuals as
    // locus 1 (Is this OK? most dataset with UEP will have 1 locus?????)
    data->uep = (int **) mycalloc (number_genomes (options->datatype) * sumtips,
                                 sizeof (int *));
    read_uep_fromfile (data->uepfile, sumtips, options->nmlength, data->uep,
                       &data->uepsites, options->datatype);
}

#endif
