











/*------------------------------------------------------
 Maximum likelihood estimation 
 of migration rate  and effectice population size
 using a Metropolis-Hastings Monte Carlo algorithm                            
 -------------------------------------------------------                        
 D A T A   R O U T I N E S 

 creates data structures,
 read data (Electrophoretic loci, sequences, microsats),
 feeds data into tree (?),
 prints data,
 destroys data.
 
 
 Theta(1)=4 N(1)mu, Theta(2)=4 N(2)mu,
 M(1) = m(1)/mu, and M(2)= m(2)/mu
                                                                                                               
 Peter Beerli 1996, Seattle
 beerli@genetics.washington.edu
 $Id: data.c,v 1.1.1.1 1998/06/06 06:09:51 beerli Exp $
-------------------------------------------------------*/
#include <string.h>

#include "migration.h"
#include "data.h"

#ifdef DMALLOC_FUNC_CHECK
#include "dmalloc.h"
#endif
/* prototypes ----------------------------------------- */
void create_data (data_fmt ** data);
void get_data (FILE * infile, data_fmt * data, option_fmt * options);
void print_data (world_fmt * world, data_fmt * data, option_fmt * options);
void print_data_summary (FILE * file, world_fmt * world);
void free_datapart (data_fmt * data, option_fmt * options, long locus);
/*private functions */
void init_data_structure1 (data_fmt ** data, option_fmt * options);
void read_header (FILE * infile, data_fmt * data, option_fmt * options);
void read_sites (data_fmt * data);
void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop);
void init_data_structure3 (data_fmt * data);
void read_popheader (FILE * infile, data_fmt * data, long pop);
void read_indname (FILE * file, data_fmt * data, long pop, long ind,
		   long nmlength);
void read_popdata (FILE * file, data_fmt * data, long pop,
		   option_fmt * options);
void read_microalleles (FILE * infile, data_fmt * data, long pop, long ind);
void read_alleles (FILE * infile, data_fmt * data, long pop, long ind);
long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
		   long locus, long pop, long ind, long baseread);
void finish_read_seq (FILE * infile, data_fmt * data,
		      option_fmt * options, long pop, long baseread);
void print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options);
void print_seqdata (world_fmt * world, data_fmt * data);
void print_header (FILE * outfile, long pop, world_fmt * world);
void create_alleles (data_fmt * data);
void addAllele (data_fmt * data, char s[], long locus, long *z);
void set_numind (data_fmt * data);
void print_seq_pop (long locus, long pop, world_fmt * world);
void print_seq_ind (long locus, long pop, long ind, world_fmt * world);
void print_locus_head (long locus, world_fmt * world);
/*=====================================================*/
void
create_data (data_fmt ** data)
{
  (*data) = (data_fmt *) calloc (1, sizeof (data_fmt));
}

void
init_data (data_fmt * data)
{

}

void
destroy_data (data_fmt * data)
{
  free (data);
}



void
get_data (FILE * infile, data_fmt * data, option_fmt * options)
{
  long pop;
  read_header (infile, data, options);
  init_data_structure1 (&data, options);
  if (options->datatype == 's')
    read_sites (data);
  else
    data->seq->fracchange = 1.0;
  for (pop = 0; pop < data->numpop; pop++)
    {
      read_popheader (data->infile, data, pop);
      init_data_structure2 (&data, options, pop);
      read_popdata (data->infile, data, pop, options);
    }
  init_data_structure3 (data);
  set_numind (data);		/* replace this, if loci can have different number of ind */
  switch (options->datatype)
    {
    case 'a':
      create_alleles (data);
      break;
    case 'b':
    case 'm':
      create_alleles (data);
      for (pop = 0; pop < data->loci; pop++)
	data->maxalleles[pop] = options->micro_stepnum;
      break;
    }
}

/* private functions ========================================== */

void
init_data_structure1 (data_fmt ** data, option_fmt * options)
{
  long pop;
  if ((*data)->yy == NULL)
    {
      (*data)->yy = (char *****) malloc (sizeof (char ****) * (*data)->numpop);
      (*data)->seq = (seqmodel_fmt *) malloc (sizeof (seqmodel_fmt));
      (*data)->popnames = (char **) malloc (sizeof (char *) * (*data)->numpop);
      (*data)->indnames = (char ***) malloc (sizeof (char **) * (*data)->numpop);
      (*data)->numind = (long **) malloc (sizeof (long *) * (*data)->numpop);
      for (pop = 0; pop < (*data)->numpop; pop++)
	{
	  (*data)->popnames[pop] = (char *) malloc (sizeof (char) *
						    options->popnmlength);
	  (*data)->numind[pop] = (long *) malloc (sizeof (long) * (*data)->loci);
	}
/*     if(options->datatype=='s') */
      (*data)->seq->sites = (long *) calloc (1, sizeof (long) * (*data)->loci);
/*     else */
/*       (*data)->seq->sites = (long *) calloc(1,sizeof(long)); */
    }
  else
    {
      fprintf (stderr,
	       "data->yy is obviously initialized, and should not be!?\n");
      exit (EXIT_FAILURE);
    }
}


void
init_data_structure2 (data_fmt ** data, option_fmt * options, long pop)
{
  long ind, locus;
  (*data)->yy[pop] = (char ****) malloc (sizeof (char ***) * (*data)->numind[pop][FLOC]);
  (*data)->indnames[pop] = (char **) calloc (1, sizeof (char *) * (*data)->numind[pop][FLOC]);
  for (ind = 0; ind < (*data)->numind[pop][FLOC]; ind++)
    {
      (*data)->indnames[pop][ind] = (char *) calloc (1, sizeof (char) * (1 + options->nmlength));
      (*data)->yy[pop][ind] = (char ***) malloc (sizeof (char **) * (*data)->loci);
      for (locus = 0; locus < (*data)->loci; locus++)
	{
	  if (options->datatype != 's')
	    {
	      (*data)->yy[pop][ind][locus] = (char **) calloc (1, sizeof (char *) * 2);
	      (*data)->yy[pop][ind][locus][0] =
		(char *) calloc (1, sizeof (char) * options->allelenmlength);
	      (*data)->yy[pop][ind][locus][1] =
		(char *) calloc (1, sizeof (char) * options->allelenmlength);
	    }
	  else
	    {
	      (*data)->yy[pop][ind][locus] = (char **) calloc (1, sizeof (char *));
	      (*data)->yy[pop][ind][locus][0] =
		(char *) calloc (1, sizeof (char) * (*data)->seq->sites[locus]);
	    }
	}
    }
}


void
free_datapart (data_fmt * data, option_fmt * options, long locus)
{
  long ind, pop, genomes;
  if (options->datatype == 's')
    genomes = 1;
  else
    genomes = 2;
  for (pop = 0; pop < data->numpop; pop++)
    {
      for (ind = 0; ind < data->numind[pop][locus] / genomes; ind++)
	{
	  if (options->datatype != 's')
	    {
	      free (data->yy[pop][ind][locus][0]);
	      free (data->yy[pop][ind][locus][1]);
	      free (data->yy[pop][ind][locus]);
	    }
	  else
	    {
	      free (data->yy[pop][ind][locus][0]);
	      free (data->yy[pop][ind][locus]);
	    }
	}
    }
}


void
init_data_structure3 (data_fmt * data)
{
  long locus, pop, maxi;
  data->allele = (allele_fmt **)
    calloc (1, sizeof (allele_fmt *) * data->loci);
  for (locus = 0; locus < data->loci; locus++)
    {
      maxi = 0;
      for (pop = 0; pop < data->numpop; pop++)
	maxi += data->numind[pop][FLOC] * 2;
      data->allele[locus] = (allele_fmt *)
	calloc (1, sizeof (allele_fmt) * maxi);
    }
  data->maxalleles = (long *) calloc (1, sizeof (long) * data->loci);
  data->skiploci = (boolean *) calloc (1, sizeof (boolean) * data->loci);
}


void
read_header (FILE * infile, data_fmt * data, option_fmt * options)
{
  char input[LINESIZE], *p;
  fgets (input, sizeof (input), infile);
  if ((p = (char *) strchr (input, '\n')) != NULL)
    *p = '\0';
  switch (lowercase (input[0]))
    {
    case 'a':
      sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype,
	      &(data->numpop), &(data->loci), options->title);
      break;
    case 'b':
    case 'm':
      sscanf (input, "%1s%ld%ld%1s%[^\n]", &options->datatype,
	      &(data->numpop), &(data->loci), &data->dlm, options->title);
      break;
    case 's':
      sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype,
	      &(data->numpop), &(data->loci), options->title);
      break;
    default:
      switch (options->datatype)
	{
	case 'a':
	  sscanf (input, "%ld%ld%[^\n]", &(data->numpop),
		  &(data->loci), options->title);
	  break;
	case 'b':
	case 'm':
	  sscanf (input, "%ld%ld%1s%[^\n]", &(data->numpop),
		  &(data->loci), &(data->dlm), options->title);
	  break;
	case 's':
	  sscanf (input, "%ld%ld%[^\n]", &(data->numpop),
		  &(data->loci), options->title);
	  break;
	}
    }
  options->datatype = lowercase (options->datatype);
}

void
read_sites (data_fmt * data)
{
  long locus;
  char input[LINESIZE], *p, *a;
  fgets (input, sizeof (input), data->infile);
  if ((p = (char *) strchr (input, '\n')) != NULL)
    *p = '\0';
  for (locus = 0; locus < data->loci; locus++)
    {
      while (isspace ((int) *input))
	(*input)++;
      if (locus == 0)
	a = strtok (input, " ");
      else
	a = strtok (NULL, " ");
      data->seq->sites[locus] = atoi (a);
    }
}

void
read_popheader (FILE * infile, data_fmt * data, long pop)
{
  char input[LINESIZE], *p;
  fgets (input, sizeof (input), infile);
  if ((p = (char *) strchr (input, '\n')) != NULL)
    *p = '\0';
  sscanf (input, "%ld%[^\n]", &(data->numind[pop][FLOC]), data->popnames[pop]);
  translate (data->popnames[pop], ' ', '_');

}

void
read_indname (FILE * file, data_fmt * data, long pop, long ind, long nmlength)
{
  long i = 0;
  while (i < nmlength)
    data->indnames[pop][ind][i++] = getc (file);
  data->indnames[pop][ind][nmlength] = '\0';
}

void
read_popdata (FILE * infile, data_fmt * data, long pop, option_fmt * options)
{
  long ind, baseread = 0;
  long locus = 0;
  for (ind = 0; ind < data->numind[pop][FLOC]; ind++)
    {
      read_indname (infile, data, pop, ind, options->nmlength);
      switch (options->datatype)
	{
	case 'a':
	  read_alleles (infile, data, pop, ind);
	  break;
	case 'b':
	case 'm':
	  if (data->dlm == '\0')
	    read_alleles (infile, data, pop, ind);
	  else
	    read_microalleles (infile, data, pop, ind);
	  break;
	case 's':
	  baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
	  break;
	default:
	  fprintf (stderr, "Wrong datatype, only the types a, m, s");
	  fprintf (stderr, " (electrophoretic alleles, \nmicrosatellite data, ");
	  fprintf (stderr, "sequence data) are allowed.\n");
	  exit (EXIT_FAILURE);
	  break;
	}
    }
  if (options->datatype != 's')
    return;
  else
    {
      finish_read_seq (infile, data, options, pop, baseread);
    }
}

void
read_microalleles (FILE * infile, data_fmt * data, long pop, long ind)
{
  char *input, *isave, dlm[2], ddlm[2], *p, *a, *a1, *a2;
  long locus, i;
  input = (char *) calloc (1, sizeof (char) * (LINESIZE + 1));
  isave = input;
  a = (char *) calloc (1, sizeof (char) * LINESIZE);
  a1 = (char *) calloc (1, sizeof (char) * LINESIZE);
  a2 = (char *) calloc (1, sizeof (char) * LINESIZE);
  dlm[0] = data->dlm, dlm[1] = '\0';
  ddlm[0] = ' ', ddlm[1] = '\0';
  fgets (input, LINESIZE, infile);
  if ((p = (char *) strchr (input, '\n')) != NULL)
    *p = '\0';
  for (locus = 0; locus < data->loci; locus++)
    {
      while (isspace ((int) *input))
	input++;
      if (input[0] == '\0')
	fgets (input, LINESIZE, infile);
      i = 0;
      while (input[i] != ' ' && input[i] != dlm[0])
	{
	  a1[i] = input[i];
	  i++;
	}
      a1[i] = '\0';
      input += i;
      i = 0;
      if (input[i] == dlm[0])
	{
	  input++;
	  while (input[i] != ' ' && input[i] != '\0')
	    {
	      a2[i] = input[i];
	      i++;
	    }
	  a2[i] = '\0';
	  if (a2[0] == '\0')
	    {
	      strcpy (a2, a1);
	    }
	  input += i;
	}
      else
	{
	  strcpy (a2, a1);
	}
      strcpy (data->yy[pop][ind][locus][0], a1);
      strcpy (data->yy[pop][ind][locus][1], a2);
    }
  free (a);
  free (a1);
  free (a2);
  free (isave);
}

void
read_alleles (FILE * infile, data_fmt * data, long pop, long ind)
{
  char *input, *p, *a;
  long locus, track = 0;
  a = (char *) calloc (1, sizeof (char) * LINESIZE);

  input = (char *) calloc (1, sizeof (char) * LINESIZE);
  fgets (input, LINESIZE, infile);
  if ((p = (char *) strchr (input, '\n')) != NULL)
    *p = '\0';
  for (locus = 0; locus < data->loci; locus++)
    {
      while (isspace ((int) *input))
	{
	  input++;
	  track++;
	}
      if (sscanf (input, "%s", a) == 1)
	{
	  input += strlen (a);
	  track += strlen (a);
	}

      data->yy[pop][ind][locus][0][0] = a[0];
      data->yy[pop][ind][locus][0][1] = '\0';
      if (a[1] == '\0')
	{
	  data->yy[pop][ind][locus][1][0] = a[0];
	  data->yy[pop][ind][locus][1][1] = '\0';
	}
      else
	{
	  data->yy[pop][ind][locus][1][0] = a[1];
	  data->yy[pop][ind][locus][1][1] = '\0';
	}
    }
  free (a);
  input -= track;
  free (input);
}

long
read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
	      long locus, long pop, long ind, long baseread)
{
  long j;
  char charstate;
  j = (options->interleaved) ? baseread : 0;
  charstate = getc (infile);
  ungetc ((int) charstate, infile);
  while (j < data->seq->sites[locus] && !(options->interleaved && charstate == '\n'))
    {
      charstate = getc (infile);
      if (charstate == '\n')
	{
	  if (options->interleaved)
	    return j;
	  else
	    charstate = ' ';
	}
      if (charstate == ' ' || (charstate >= '0' && charstate <= '9'))
	continue;
      charstate = uppercase (charstate);
      if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
	{
	  printf ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
		  charstate, j, ind, pop);
	  exit (EXIT_FAILURE);
	}
      data->yy[pop][ind][locus][0][j++] = charstate;
    }
  charstate = getc (infile);	/* swallow the \n */
  return j;
}


void
finish_read_seq (FILE * infile, data_fmt * data,
		 option_fmt * options, long pop, long baseread)
{

  long ind, baseread2 = 0, locus = 0;
  if (options->interleaved)
    {
      while (baseread < data->seq->sites[0])
	{
	  for (ind = 0; ind < data->numind[pop][FLOC]; ind++)
	    {
	      baseread2 = read_ind_seq (infile, data, options, locus, pop, ind, baseread);
	    }
	  baseread = baseread2;
	}
    }
  for (locus = 1; locus < data->loci; locus++)
    {
      baseread = 0;
      for (ind = 0; ind < data->numind[pop][FLOC]; ind++)
	{
	  read_indname (infile, data, pop, ind, options->nmlength);
	  baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
	}
      if (options->interleaved)
	{
	  while (baseread < data->seq->sites[locus])
	    {
	      for (ind = 0; ind < data->numind[pop][FLOC]; ind++)
		{
		  baseread2 = read_ind_seq (infile, data, options, locus, pop, ind, baseread);
		}
	      baseread = baseread2;
	    }
	}
    }
}

void
print_data_summary (FILE * file, world_fmt * world)
{
  long total = 0;
  long pop;
  fprintf (file, "\n\nSummary of data:\n");
  fprintf (file, "---------------\n");
  fprintf (file, "Datatype:                               %20s\n",
	   world->options->datatype == 'a' ?
	   "Allelic data" : (world->options->datatype == 's' ?
			     "Sequence data" : "Microsatellite data"));

  fprintf (file, "Number of loci:                         %20li\n\n", world->data->loci);

  fprintf (file, "Population                                       Individuals\n");
  fprintf (file, "------------------------------------------------------------\n");
  for (pop = 0; pop < world->numpop; pop++)
    {
      fprintf (file, "%3li %-50.50s%6li\n", pop + 1, world->data->popnames[pop], world->data->numind[pop][FLOC]);
      total += world->data->numind[pop][FLOC];
    }
  fprintf (file, "Total of all populations                              %6li\n\n", total);

}

void
print_data (world_fmt * world, data_fmt * data, option_fmt * options)
{
  if (options->printdata)
    {
      switch (options->datatype)
	{
	case 'a':
	case 'b':
	case 'm':
	  print_alleledata (world, data, options);
	  break;
	case 's':
	  print_seqdata (world, data);
	  break;
	}
    }
}

void
print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options)
{
  long i, pop, ind, locus, mult80;
  for (pop = 0; pop < data->numpop; pop++)
    {
      print_header (world->outfile, pop, world);
      for (ind = 0; ind < data->numind[pop][FLOC]; ind++)
	{
	  fprintf (world->outfile, "%-*.*s ",
		   (int) options->nmlength, (int) options->nmlength,
		   data->indnames[pop][ind]);
	  mult80 = options->nmlength;
	  for (locus = 0; locus < data->loci; locus++)
	    {
	      mult80 += 1 + strlen (data->yy[pop][ind][locus][0]) + strlen (data->yy[pop][ind][locus][1]);
	      if (mult80 >= 80)
		{
		  mult80 = 0;
		  fprintf (world->outfile, "\n");
		  for (i = 0; i < options->nmlength; i++)
		    fputc (' ', world->outfile);
		}
	      fprintf (world->outfile, " %s.%-s",
		       data->yy[pop][ind][locus][0],
		       data->yy[pop][ind][locus][1]);
	    }
	  fprintf (world->outfile, "\n");
	}
      fprintf (world->outfile, "\n");
    }
  fprintf (world->outfile, "\n\n");
}

void
print_seqdata (world_fmt * world, data_fmt * data)
{
  long pop, locus;
  for (pop = 0; pop < data->numpop; pop++)
    {
      print_header (world->outfile, pop, world);
      for (locus = 0; locus < data->loci; locus++)
	{
	  print_locus_head (locus, world);
	  print_seq_pop (locus, pop, world);
	}
    }
}

void
print_header (FILE * outfile, long pop, world_fmt * world)
{
  long i;
  long locus, mult80 = 80;
  char input[LINESIZE];
  fprintf (outfile, "\n%-s", world->data->popnames[pop]);
  for (i = 0; i < (long) (80 - strlen (world->data->popnames[pop])); i++)
    fputc ('-', outfile);
  fprintf (outfile, "\n\n");
  if (world->options->datatype != 's')
    {
      fprintf (outfile, "%-s  ", (world->data->loci == 1 ? "locus" : "loci "));
      for (i = 0; i < (world->options->nmlength - 6); i++)
	fputc (' ', outfile);
      for (locus = 0; locus < world->data->loci; locus++)
	{
	  if (locus * 4 + world->options->nmlength > mult80)
	    {
	      mult80 += 80;
	      fprintf (outfile, "\n");
	      for (i = 0; i < world->options->nmlength; i++)
		fputc (' ', outfile);
	    }
	  fprintf (outfile, "  %2li", locus + 1);
	}
      fprintf (outfile, "\n%-s\n", strncpy (input, "indiv.", world->options->nmlength));
    }
}



void
create_alleles (data_fmt * data)
{
  long locus, pop, ind;
  long z;
  char a1[DEFAULT_ALLELENMLENGTH];
  char a2[DEFAULT_ALLELENMLENGTH];
  for (locus = 0; locus < data->loci; locus++)
    {
      z = 0;
      for (pop = 0; pop < data->numpop; pop++)
	{
	  for (ind = 0; ind < data->numind[pop][locus]; ind++)
	    {
	      strcpy (a1, data->yy[pop][ind][locus][0]);
	      strcpy (a2, data->yy[pop][ind][locus][1]);
	      if (!strcmp (a1, a2))
		{
		  addAllele (data, a1, locus, &z);
		}
	      else
		{
		  addAllele (data, a1, locus, &z);
		  addAllele (data, a2, locus, &z);
		}
	    }
	}
      data->maxalleles[locus] = z + 1;
      /* + 1: for all the unencountered alleles */
    }
}

void
addAllele (data_fmt * data, char s[], long locus, long *z)
{
  long found = 0;
  while ((data->allele[locus][found++][0] != '\0')
	 && (strcmp (s, data->allele[locus][found - 1])));
  if (found > (*z))
    {
      strcpy (data->allele[locus][*z], s);
      (*z)++;
    }
}

void
set_numind (data_fmt * data)
{
  long locus, pop;
  for (locus = 1; locus < data->loci; locus++)
    {
      for (pop = 0; pop < data->numpop; pop++)
	{
	  data->numind[pop][locus] = data->numind[pop][FLOC];
	}
    }
}


void
print_seq_pop (long locus, long pop, world_fmt * world)
{
  long ind;
  for (ind = 0; ind < world->data->numind[pop][locus]; ind++)
    {
      print_seq_ind (locus, pop, ind, world);
    }
}

void
print_seq_ind (long locus, long pop, long ind, world_fmt * world)
{
  long site;
  char blank[2] = " ";
  fprintf (world->outfile, "%-*.*s", (int) world->options->nmlength,
	   (int) world->options->nmlength,
	   world->data->indnames[pop][ind]);
  fprintf (world->outfile, " %c",
	   world->data->yy[pop][ind][locus][0][0]);
  for (site = 1; site < world->data->seq->sites[locus]; site++)
    {
      if ((site) % 60 == 0)
	{
	  fprintf (world->outfile, "\n%-*.*s %c", (int) world->options->nmlength,
		   (int) world->options->nmlength,
		   blank,
		   world->data->yy[pop][ind][locus][0][site]);
	}
      else
	{
	  if ((site) % 10 == 0)
	    {
	      fprintf (world->outfile, " ");
	    }
	  fprintf (world->outfile, "%c",
		   world->data->yy[pop][ind][locus][0][site]);
	}
    }
  fprintf (world->outfile, "\n");
}


void
print_locus_head (long locus, world_fmt * world)
{
  char *head;
  head = (char *) calloc (1, sizeof (char) * MAX (10, world->options->nmlength));
  sprintf (head, "Locus %li", locus);
  fprintf (world->outfile, "%-*.*s --------10 --------20 --------30",
	   (int) world->options->nmlength,
	   (int) world->options->nmlength,
	   head);
  fprintf (world->outfile, " --------40 --------50 --------60\n");

  free (head);
}
