• API Main Page
  • Documentation
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

src/clustering.c

Go to the documentation of this file.
00001 /***************************************************************************
00002  *   Copyright (C) 2008 by Matthias Ihrke   *
00003  *   mihrke@uni-goettingen.de   *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU General Public License as published by  *
00007  *   the Free Software Foundation; either version 2 of the License, or     *
00008  *   (at your option) any later version.                                   *
00009  *                                                                         *
00010  *   This program is distributed in the hope that it will be useful,       *
00011  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00012  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00013  *   GNU General Public License for more details.                          *
00014  *                                                                         *
00015  *   You should have received a copy of the GNU General Public License     *
00016  *   along with this program; if not, write to the                         *
00017  *   Free Software Foundation, Inc.,                                       *
00018  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
00019  ***************************************************************************/
00020 
00021 #include "clustering.h"
00022 #include "linalg.h"
00023 #include <time.h>
00024 #include "eeg.h"
00025 
00026 
00035 Clusters*    kmedoids_repeat( const Array *distmat, int K, int repeat ){
00036   Clusters *C;
00037   Clusters *Cnew;
00038   int i;
00039   double ratio, tmp;
00040   bool ismatrix;
00041   matrix_CHECKSQR( ismatrix, distmat );
00042   if( !ismatrix) return NULL;
00043   int N = distmat->size[0];
00044 
00045   long initial_seed=(long)time(NULL);
00046   OptArgList *opts=optarglist("seed=long", initial_seed );
00047 
00048   C = cluster_init( K, N );
00049   ratio=DBL_MAX;
00050   for( i=0; i<repeat; i++ ){
00051      optarglist_optarg_by_key( opts, "seed" )->data_scalar=initial_seed+i*10;
00052      Cnew = kmedoids( distmat, K, opts );
00053      tmp = 1.0;
00054      
00055      tmp =  cluster_within_scatter ( distmat, Cnew ); 
00056      tmp /= cluster_between_scatter( distmat, Cnew ); 
00057 
00058      if( tmp<ratio ){
00059         dprintf("Run %i: %f\n", i, tmp);
00060         ratio=tmp; 
00061         cluster_copy( C, Cnew );
00062      }
00063      cluster_free( Cnew );
00064   }
00065   optarglist_free( opts );
00066 
00067   return C;
00068 }
00069 
00079 Clusters*    kmedoids(const Array *distmat, int K, OptArgList *optargs ){
00080   Clusters *C, *Cnew;
00081   int *medoids;
00082   int i, j, k, r, minidx, num_not_changed;
00083   double sum, minsum, mindist, curdist;
00084   bool ismatrix;
00085   matrix_CHECKSQR( ismatrix, distmat );
00086   if( !ismatrix) return NULL;
00087   int N = distmat->size[0];
00088 
00089   /* optional arguments */
00090   unsigned long seed=0;
00091   double x;
00092   optarg_PARSE_SCALAR( optargs, "seed", seed, unsigned long, x );
00093 
00094   if( K>N ){
00095      errprintf("K>=N: %i>=%i\n", K, N );
00096      return NULL;
00097   }
00098 
00099   /* memory alloc */
00100   medoids = (int*) malloc( K*sizeof(int) );
00101   C    = cluster_init(K, N);
00102   Cnew = cluster_init(K, N);
00103 
00104   /* initialization step, random init */ 
00105   const gsl_rng_type * T;
00106   gsl_rng * randgen;
00107   gsl_rng_env_setup();
00108   T = gsl_rng_default;
00109   randgen = gsl_rng_alloc (T);
00110   if( seed==0 )
00111      seed=(unsigned long)time(NULL);
00112   gsl_rng_set (randgen, seed );
00113 
00114   Array *permut=array_new2( UINT, 1, K );
00115   for( i=0; i<K; i++ )
00116      array_INDEX1( permut, uint, i )=i;
00117   array_shuffle( permut, seed );
00118   for( i=0; i<N; i++ ){
00119      if( i<K ){ /* first each partition gets a guy */
00120         r = array_INDEX1( permut, uint, i );
00121      } else {
00122         r = gsl_rng_uniform_int( randgen, K );
00123      }
00124      C->clust[r][C->n[r]] = i;
00125      (C->n[r])++;
00126   }
00127   array_free( permut );
00128 
00129   dprintf("initialized cluster\n");  
00130 #ifdef DEBUG
00131   cluster_print(stderr, C); 
00132 #endif
00133 
00134   /*------------------ computation ----------------------------*/
00135   num_not_changed = 0;
00136   while(num_not_changed<1){ /* iterate until convergence */
00137      for( i=0; i<K; i++ ){ /* minimize distance to one of trials */
00138         minsum = DBL_MAX;
00139         minidx = 0;
00140         for( j=0; j<C->n[i]; j++ ){
00141           sum = 0;
00142           for( k=0; k<C->n[i]; k++){
00143              sum += mat_IDX( distmat, C->clust[i][j], C->clust[i][k] );
00144           }
00145           
00146           if( sum<minsum ){
00147              minsum = sum;
00148              minidx = C->clust[i][j];
00149           }
00150         }
00151         medoids[i] = minidx;
00152      }
00153 
00154      /* reset cluster */
00155      for( i=0; i<Cnew->K; i++ ){
00156         Cnew->n[i] = 0;
00157      }
00158      
00159      for( i=0; i<N; i++ ){ /* reassign clusters */
00160         mindist = DBL_MAX;
00161         minidx = 0;
00162         for( j=0; j<K; j++ ){ /* look for closest center */
00163           curdist = mat_IDX( distmat, medoids[j], i );
00164           if( curdist<mindist ){
00165              mindist=curdist;
00166              minidx=j;
00167           }
00168         }
00169         /*dprintf("Assigning trial %i to cluster %i\n", i, minidx);*/
00170         Cnew->clust[minidx][Cnew->n[minidx]] = i;
00171         Cnew->n[minidx] += 1;
00172      }
00173 
00174      if(!cluster_compare(C, Cnew))
00175         num_not_changed++;
00176      else
00177         cluster_copy(C, Cnew);
00178   }
00179   /*------------------ /computation ----------------------------*/
00180 
00181   free(medoids);
00182   cluster_free(Cnew);
00183   gsl_rng_free (randgen);
00184 
00185   return C;
00186 }
00187 
00193 int cluster_compare(const Clusters *c1, const Clusters *c2){
00194   int i, j, k, l, found, el;
00195   int c2i, prev_c2i;
00196 
00197   if( c1->K != c2->K ) return 1;
00198   /* clusters with equal number of trials? */
00199   for( i=0; i<c1->K; i++ ){
00200      found = 0;
00201      for( j=0; j<c2->K; j++ ){
00202         if(c1->n[i] == c2->n[j]){
00203           found = 1;
00204           break;
00205         }
00206      }
00207      if(!found)     
00208         return 1;
00209   }
00210 
00211   /* deep comparison */
00212   for( i=0; i<c1->K; i++ ){ /* each cluster from c1 */ 
00213      /* dprintf("i = %i\n", i); */
00214      prev_c2i = c1->K+1;
00215      c2i = 0;
00216      for( j=0; j<c1->n[i]; j++ ){ /* each element from c1 */
00217         el = c1->clust[i][j];
00218         /*      dprintf("j = %i\n", j);*/
00219         for( k=0; k<c2->K; k++ ){
00220           for( l=0; l<c2->n[k]; l++ ){
00221              if( el == c2->clust[k][l] ){
00222                 c2i = k;
00223                 if(prev_c2i==c1->K+1)
00224                   prev_c2i = k; /* first run */
00225              } 
00226           }
00227         }
00228         if( c2->n[c2i] != c1->n[i] ){
00229           /*          dprintf("ab: c2->n[%i]=%i, c1->n[%i]=%i\n", c2i, c2->n[c2i], i, c1->n[i]);*/
00230           return 1;
00231         }
00232         if(prev_c2i!=c2i){
00233           /*          dprintf("ab: prev_c2i=%i, c2i=%i\n", prev_c2i, c2i); */
00234           return 1;
00235         }
00236      }
00237   }
00238   return 0;
00239 }
00240 
00242 void cluster_copy(Clusters *dest, const Clusters *src){
00243   int i, j;
00244 
00245   dest->K = src->K;
00246   for( i=0; i<src->K; i++ ){
00247      dest->n[i] = src->n[i];
00248      for( j=0; j<src->n[i]; j++ ){
00249         dest->clust[i][j] = src->clust[i][j];
00250      }
00251   }
00252   
00253 }
00254 
00255 void cluster_free(Clusters *c){
00256   int i;
00257   free( c->n );
00258   for( i=0; i<c->K; i++)
00259      free(c->clust[i]);
00260   free(c->clust);
00261   free(c);
00262 }
00263 
00264 Clusters* cluster_init(int K, int maxN){
00265   Clusters *c;
00266   int i;
00267 
00268   c = (Clusters*) malloc( sizeof(Clusters) );
00269   c->K=K;
00270   c->n = (int*) malloc( K*sizeof(int) );
00271   c->clust = (int**) malloc( K*sizeof(int*) );
00272   for( i=0; i<K; i++ ){
00273      c->clust[i] = (int*) malloc( maxN*sizeof(int) );
00274      c->n[i] = 0;
00275   }
00276 
00277   return c;
00278 }
00279 
00280 void cluster_print(FILE *o,const Clusters *c){
00281   int i, j;
00282 
00283   fprintf(o, "Cluster with %i partitions\n", c->K);
00284   for( i=0; i<c->K; i++ ){
00285      fprintf(o, " C[%i]=\{ ", i+1);
00286      for( j=0; j<c->n[i]; j++ ){
00287         fprintf( o, "%i ", c->clust[i][j] );
00288      }
00289      fprintf( o, "}\n" );
00290   }
00291   fprintf( o, "\n" );
00292 }
00293 
00294 #if 0
00295 
00300 GapStatistic* gapstat_init( GapStatistic *g, int K, int B ){
00301   int i;
00302 
00303   if( g==ALLOC_IN_FCT ){
00304      g = (GapStatistic*)malloc( sizeof(GapStatistic) );
00305   }
00306   g->K = K;
00307   g->B = B;
00308   g->khat = -1;
00309   dprintf("allocating memory\n");
00310   g->gapdistr = (double*)malloc( K*sizeof(double) );
00311   g->sk = (double*)malloc( K*sizeof(double) );
00312   g->Wk= (double*)malloc( K*sizeof(double) );
00313   g->Wkref= (double**) malloc( B*sizeof(double*) );
00314   for( i=0; i<B; i++ )
00315      g->Wkref[i] = (double*) malloc( K*sizeof(double) );
00316   g->progress=NULL;
00317 
00318   return g;
00319 }
00320 
00321 void gapstat_free( GapStatistic *g ){
00322   int i;
00323   free( g->Wk );
00324   for( i=0; i<g->B; i++ )
00325      free( g->Wkref[i] );
00326   free( g->Wkref );
00327   free( g->gapdistr );
00328   free( g->sk );
00329   free( g );
00330 }
00331 
00339 void gapstat_calculate( GapStatistic *gap, double **X, int n, int p, 
00340                                 VectorDistanceFunction distfunction, const double** D ){
00341   int i, k, b;
00342   Clusters *C;
00343   double **Xr; /* reference distribution */
00344   double **Dr; /* distance in ref dist */
00345   double l_bar; /* (1/B sum_b( log(Wkref[b]) )) */
00346 
00347   /* init */
00348   Xr = (double**) malloc( n*sizeof( double* ) );
00349   Dr = (double**) malloc( n*sizeof( double* ) );
00350   for( i=0; i<n; i++ ){
00351      Xr[i] = (double*) malloc( p*sizeof( double ) );
00352      Dr[i] = (double*) malloc( n*sizeof( double ) );
00353   }
00354 
00355   if( gap->progress ){
00356      gap->progress( PROGRESSBAR_INIT, (gap->B+1)*gap->K );
00357   }
00358 
00359   /* calculation */
00360   for( k=1; k<=gap->K; k++ ){ /* data within-scatter */
00361      if( gap->progress )
00362         gap->progress( PROGRESSBAR_CONTINUE_LONG, k );
00363      C = kmedoids_repeat( D, n, k, 50 );
00364      //  cluster_print( stderr, C );
00365      gap->Wk[k-1] = get_within_scatter( D, n, C );
00366      cluster_free( C );
00367   }
00368 
00369   for( b=0; b<gap->B; b++ ){ /* monte Carlo for ref-data*/
00370      for( k=1; k<=gap->K; k++ ){
00371         if( gap->progress ){
00372           gap->progress( PROGRESSBAR_CONTINUE_LONG, (b+1)*gap->K+k );
00373         }
00374         
00375         Xr = gap_get_reference_distribution_simple( (const double **) X, n, p, Xr );
00376         Dr = vectordist_distmatrix( distfunction, (const double**)Xr, n, p, Dr, NULL, NULL );
00377         C = kmedoids_repeat( (const double**)Dr, n, k, 50 );
00378         //      cluster_print(stderr, C );
00379         gap->Wkref[b][k-1] = get_within_scatter( (const double**)Dr, n, C );
00380         cluster_free( C );
00381      }
00382   }
00383   
00384   for( k=1; k<=gap->K; k++ ){ /* gap distr */
00385      gap->gapdistr[k-1]=0;
00386      for( b=0; b<gap->B; b++ ){
00387         gap->gapdistr[k-1] += log( gap->Wkref[b][k-1] );
00388      }
00389      l_bar = ((gap->gapdistr[k-1])/(double)gap->B);
00390      gap->gapdistr[k-1] = l_bar - log( gap->Wk[k-1] );
00391 
00392      /* need std and sk */
00393      gap->sk[k-1] = 0;
00394      for( b=0; b<gap->B; b++ ){
00395         gap->sk[k-1] += SQR( log( gap->Wkref[b][k-1] ) - l_bar );
00396      }
00397      gap->sk[k-1] = sqrt( gap->sk[k-1]/(double)gap->B );
00398      gap->sk[k-1] = gap->sk[k-1] * sqrt( 1.0 + 1.0/(double)gap->B );
00399   }
00400   
00401   /* find best k */
00402   for( k=0; k<=gap->K-1; k++ ){
00403      if( gap->gapdistr[k] >= ((gap->gapdistr[k+1])-(gap->sk[k+1])) ){
00404         gap->khat = k+1;
00405         break;
00406      }
00407   }
00408   if( gap->progress )
00409      gap->progress( PROGRESSBAR_FINISH, 0 );
00410 
00411   /* free */
00412   for( i=0; i<n; i++ ){
00413      free(Xr[i]);
00414      free(Dr[i]);
00415   }
00416   free( Xr );
00417   free( Dr );
00418 }
00419 
00420 void gapstat_print( FILE *out, GapStatistic *g ){
00421   int i, j;
00422   double mean; 
00423 
00424   fprintf(stderr, "GapStatistic:\n"
00425              " K=%i\n"
00426              " B=%i\n"
00427              " khat=%i\n", g->K, g->B, g->khat );
00428   fprintf(stderr, " Gap       = [ ");
00429   for(i=0; i<g->K; i++ )
00430      fprintf(stderr, "(%i, %.2f) ", i+1, g->gapdistr[i]);
00431   fprintf(stderr, "]\n");
00432 
00433   fprintf(stderr, " sk        = [ ");
00434   for(i=0; i<g->K; i++ )
00435      fprintf(stderr, "(%i, %.4f) ", i+1, g->sk[i]);
00436   fprintf(stderr, "]\n");
00437 
00438   fprintf(stderr, " Wk        = [ ");
00439   for(i=0; i<g->K; i++ )
00440      fprintf(stderr, "(%i, %.4f) ", i+1, g->Wk[i]);
00441   fprintf(stderr, "]\n");
00442 
00443   fprintf(stderr, " <Wkref>_B = [ ");
00444   for(i=0; i<g->K; i++ ){
00445      mean = 0;
00446      for(j=0; j<g->B; j++){
00447         mean += g->Wkref[j][i];
00448      }
00449      mean /= (double)g->B;
00450      fprintf(stderr, "(%i, %.4f) ", i+1, mean);
00451   }
00452   fprintf(stderr, "]\n");
00453 
00454 }
00455 
00461 double** gap_get_reference_distribution_simple( const double **X, int n, int p, double **Xr ){
00462   int i, j;
00463   double minf, maxf;
00464 
00465   for( i=0; i<p; i++ ){ /* features */
00466      minf=DBL_MAX;
00467      maxf=DBL_MIN;
00468      for( j=0; j<n; j++ ){        /* find min/max for feature i */
00469         if( X[j][i]<minf )
00470           minf = X[j][i];
00471         if( X[j][i]>maxf )
00472           maxf = X[j][i];
00473      }
00474      
00475      for( j=0; j<n; j++ ){ /* uniform random values in [minf,maxf] */
00476         Xr[j][i] = (drand48()*maxf)+minf;
00477         if( isnan(Xr[j][i]) ){
00478           errprintf(" Xr[%i][%i] is nan\n", j, i);
00479         }
00480      }
00481   }
00482 
00483   return Xr;
00484 }
00485 
00493 double** gap_get_reference_distribution_svd( const double **X, int n, int p, double **Xr ){
00494   int i, j;
00495   double minf, maxf;
00496   gsl_matrix *gX, *gXd, *V;
00497   gsl_vector *work, *S;
00498 
00499   /* convert to gsl */
00500   gX = gsl_matrix_alloc( n, p );
00501   gXd= gsl_matrix_alloc( n, p );
00502   V  = gsl_matrix_alloc( n, p );
00503   S  = gsl_vector_alloc( p );
00504   work=gsl_vector_alloc( p );
00505 
00506   for( i=0; i<n; i++ ){
00507      for( j=0; j<p; j++ ){
00508         gsl_matrix_set( gX, i, j, X[i][j] );
00509      }
00510   }
00511   gsl_matrix_memcpy( gXd, gX );
00512 
00513   gsl_linalg_SV_decomp( gXd, V, S, work);
00514 
00515   gsl_blas_dgemm(CblasNoTrans, /* matrix mult */
00516                       CblasNoTrans,
00517                       1.0, gX, V, 0.0, gXd);
00518 
00519   for( i=0; i<p; i++ ){ /* features */
00520      minf=DBL_MAX;
00521      maxf=DBL_MIN;
00522      for( j=0; j<n; j++ ){        /* find min/max for feature i */
00523         if( gsl_matrix_get( gXd, j, i )<minf )
00524           minf = gsl_matrix_get( gXd, j, i );
00525         if( gsl_matrix_get( gXd, j, i )>maxf )
00526           maxf = gsl_matrix_get( gXd, j, i );
00527      }
00528      
00529      for( j=0; j<n; j++ ){ /* uniform random values in [minf,maxf] */
00530         gsl_matrix_set( gX, j, i, ((drand48()*maxf)+minf) );
00531      }
00532   }
00533 
00534   gsl_matrix_transpose_memcpy( gXd, V );
00535   gsl_blas_dgemm(CblasNoTrans,
00536                       CblasNoTrans,
00537                       1.0, gX, gXd, 0.0, V);
00538 
00539   for( i=0; i<n; i++ ){
00540      for( j=0; j<p; j++ ){
00541         Xr[j][i] = gsl_matrix_get( V, i, j );
00542      }
00543   }
00544 
00545   gsl_matrix_free( gX );
00546   gsl_matrix_free( gXd );
00547   gsl_matrix_free( V );
00548   gsl_vector_free( work );
00549   gsl_vector_free( S );
00550 
00551   return Xr;
00552 }
00553 
00554 #endif
00555 
00567 double   cluster_within_scatter (const Array *distmat, const Clusters *c){
00568   double *sums;
00569   double W;
00570   int i, j, k; 
00571   bool ismatrix;
00572   matrix_CHECKSQR( ismatrix, distmat );
00573   if( !ismatrix) return -1;
00574 
00575   //  cluster_print( stderr, c);
00576   sums = (double*) malloc( c->K*sizeof(double) );
00577   W = 0;
00578   int numcomp=0;
00579   for( i=0; i<c->K; i++ ){ /* cluster loop */
00580      sums[i] = 0;
00581      for( j=0; j<(c->n[i])-1; j++ ){
00582         for( k=j+1; k<c->n[i]; k++){     
00583           sums[i] += mat_IDX( distmat, c->clust[i][j], c->clust[i][k]);
00584           numcomp++;
00585         }
00586      }
00587      //  sums[i] /= (double)2*c->n[i];
00588      /* dprintf("sums[i]=%f\n", sums[i]); */
00589      W += sums[i];
00590   }
00591   W /= (double)2*numcomp;
00592 
00593   free(sums);
00594 
00595   return W;
00596 }
00597 
00604 double   cluster_between_scatter(const Array *distmat, const Clusters *c){
00605   double W;
00606   int k1, k2, i, j;
00607   int num_comp; 
00608   bool ismatrix;
00609   matrix_CHECKSQR( ismatrix, distmat );
00610   if( !ismatrix) return -1;
00611 
00612   W = 0.0; 
00613   num_comp=0;
00614   for( k1=0; k1<c->K-1; k1++ ){
00615      for( k2=k1+1; k2<c->K; k2++ ){ /* compare cluster k1 and k2 */
00616         for( i=0; i<c->n[k1]; i++ ){
00617           for( j=0; j<c->n[k2]; j++ ){
00618              W += mat_IDX( distmat, c->clust[k1][i], c->clust[k2][j] );
00619              num_comp++;
00620           }
00621         }
00622      }
00623   }
00624   W /= (double)2*num_comp;
00625 
00626   return W;
00627 }
00628 
00629 
00640 Dendrogram*  agglomerative_clustering(const Array *distmat, LinkageFunction distfct){
00641   Dendrogram **nodes, *tmp; /* at the lowest level, we have N nodes */
00642   double min_d, cur_d=0.0;
00643   int min_i=0, min_j=0;
00644   int num_nodes;
00645   int i, j;
00646   bool ismatrix;
00647   matrix_CHECKSQR( ismatrix, distmat );
00648   if( !ismatrix) return NULL;
00649   int N=distmat->size[1];
00650 
00651   nodes = (Dendrogram**) malloc( N*sizeof( Dendrogram* ) );
00652 
00653   int clustidx=0;
00654   for( i=0; i<N; i++ ){ /* initialize terminal nodes */
00655      nodes[i] = dgram_init( i, NULL, NULL );
00656      nodes[i]->clustnum=clustidx++;
00657   }
00658   num_nodes = N;
00659 
00660   while( num_nodes > 1 ){
00661      dprintf("nmodes = %i\n", num_nodes );
00662      min_d = DBL_MAX;
00663      for( i=0; i<num_nodes-1; i++ ){
00664         for( j=i+1; j<num_nodes; j++ ){ 
00665           cur_d = distfct( distmat, nodes[i], nodes[j] );
00666           if( cur_d<=min_d ){
00667              min_d=cur_d;
00668              min_i=i;
00669              min_j=j;
00670           }
00671         }
00672      }
00673      
00674      /* we know min_i and min_j, now */
00675      tmp = dgram_init( -1, nodes[min_i], nodes[min_j] ); /* link the two nodes */
00676      tmp->height=min_d;
00677      tmp->clustnum=clustidx++;
00678      nodes[min_i] = tmp; /* remove min_i */
00679      tmp = nodes[num_nodes-1]; /* swap with last entry in current list */
00680      nodes[min_j] = tmp;
00681      num_nodes--;
00682      dprintf("Linking (%i,%i), height=%f\n", min_i, min_j, min_d );
00683   }
00684   tmp = nodes[0]; /* this is the root-node now */
00685   free( nodes );
00686   return tmp;
00687 }
00688 
00690 void _dgram_walk_matlab( const Dendrogram *t, Array *a ){
00691   if( t->val>=0 ){ /* leave */
00692      return;
00693   }
00694   mat_IDX( a, t->clustnum, 0 ) = t->left->clustnum+1;
00695   mat_IDX( a, t->clustnum, 1 ) = t->right->clustnum+1;
00696   mat_IDX( a, t->clustnum, 2 ) = t->height; 
00697   _dgram_walk_matlab( t->left, a );
00698   _dgram_walk_matlab( t->right, a );
00699 }
00740 Array* dgram_to_matlab( const Dendrogram *dgram ){
00741   int N=dgram_num_leaves( dgram );
00742   Array *mat=array_new2( DOUBLE, 2, N+N-1, 3 );
00743   _dgram_walk_matlab( dgram, mat );
00744   char buf[200];
00745   sprintf( buf, "%i-%i,:", N, N+N-2 );
00746   Array *rmat=array_slice( mat, buf );
00747   array_free(mat);
00748   return rmat;
00749 }
00750 
00754 Dendrogram* dgram_init(int val, Dendrogram *left, Dendrogram *right ){
00755   Dendrogram *c;
00756   c = (Dendrogram*) malloc( sizeof(Dendrogram) );
00757   c->val = val;
00758   c->height=-1.0;
00759   c->clustnum=0;
00760   c->left = left;
00761   c->right= right;
00762   return c;
00763 }
00764 
00765 
00768 void dgram_free(Dendrogram *t){
00769   if( t->right==NULL && t->left==NULL ){ /* base case */
00770      free( t );
00771      return;
00772   } else { /* recurse into sub-trees */
00773      if( t->right!=NULL ){
00774         dgram_free( t->right );
00775      } 
00776      if( t->left!=NULL ){
00777         dgram_free( t->left );
00778      }
00779   }
00780 }
00782 void _dgram_preorder_recursive( const Dendrogram *t, int *vals, int *n ){
00783   if( t->left==NULL && t->right==NULL ){
00784      if( t->val<0 ){
00785         errprintf("t->val<0 (%i) even though it's a terminal node\n", t->val);
00786         return;
00787      }
00788      vals[(*n)++] = t->val;
00789      return;
00790   } 
00791   if( t->left!=NULL ){
00792      //  dprintf("walking  left subtree\n");
00793      _dgram_preorder_recursive( t->left, vals, n );
00794   }
00795   if( t->right!=NULL ){
00796      _dgram_preorder_recursive( t->right, vals, n );
00797   }
00798   return;
00799 }
00810 void         dgram_preorder( const Dendrogram *t, int *vals, int *n ){
00811   int m;
00812 
00813   m = 0;
00814   //  dprintf(" starting recursive walk, m=%i\n", m);
00815   //  dgram_print_node(t);
00816   _dgram_preorder_recursive( t, vals, &m );
00817 
00818   /* fprintf(stderr, "vals=["); */
00819   /* for( i=0; i<m; i++ ){ */
00820   /*     fprintf(stderr, "%i ", vals[i]); */
00821   /* } */
00822   /* fprintf(stderr, "]\n"); */
00823   //  dprintf(" done with recursive walk, m=%i\n", m);
00824   *n = m;
00825 
00826   return;
00827 }
00833 double dgram_dist_singlelinkage  (const Array *d, const Dendrogram *c1, const Dendrogram *c2){
00834   double dist;
00835   int *el1, *el2;
00836   int n1, n2;
00837   int i, j;
00838   bool ismatrix;
00839   matrix_CHECK( ismatrix,d );
00840   if( !ismatrix ) return -1;
00841   if( d->size[0]!=d->size[1] ){  
00842      errprintf("distmat must be N x N, got (%i,%i)\n",d->size[0],d->size[1]);
00843      return -1; 
00844   }
00845   int N = d->size[0];
00846 
00847   dist = DBL_MAX;
00848   
00849   el1 = (int*) malloc( N*sizeof( int ) ); /* N suffices */
00850   el2 = (int*) malloc( N*sizeof( int ) ); 
00851 
00852   dgram_preorder( c1, el1, &n1 );
00853   dgram_preorder( c2, el2, &n2 );
00854 
00855   for( i=0; i<n1; i++ ){
00856      for( j=0; j<n2; j++ ){
00857         if( el1[i]<0 || el1[i]>=N || el2[j]<0 || el2[j]>=N ){
00858           errprintf("something's wrong, el[i] not in range 0-%i\n", N);
00859           return -1;
00860         }
00861         if( mat_IDX( d, el1[i], el2[j]) < dist ){
00862           dist = mat_IDX( d, el1[i], el2[j]);
00863         }
00864      }
00865   }
00866   
00867   //dprintf("SL-distance = %f\n", dist);
00868 
00869   free(el1);
00870   free(el2);
00871   return dist;
00872 }
00873 
00879 double dgram_dist_completelinkage(const Array *d, const Dendrogram *c1, const Dendrogram *c2){
00880   double dist;
00881   int *el1, *el2;
00882   int n1, n2;
00883   int i, j;
00884   bool ismatrix;
00885   matrix_CHECK( ismatrix,d );
00886   if( !ismatrix ) return -1; 
00887   if( d->size[0]!=d->size[1] ){  
00888      errprintf("distmat must be N x N, got (%i,%i)\n",d->size[0],d->size[1]);
00889      return -1; 
00890   }
00891   int N = d->size[0];
00892 
00893   dist = DBL_MIN;
00894   
00895   el1 = (int*) malloc( N*sizeof( int ) ); /* N suffices */
00896   el2 = (int*) malloc( N*sizeof( int ) ); 
00897 
00898   dgram_preorder( c1, el1, &n1 );
00899   dgram_preorder( c2, el2, &n2 );
00900 
00901   for( i=0; i<n1; i++ ){
00902      for( j=0; j<n2; j++ ){
00903         if( el1[i]<0 || el1[i]>=N || el2[j]<0 || el2[j]>=N ){
00904           errprintf("something's wrong, el[i] not in range 0-%i\n", N);
00905           return -1;
00906         }
00907         if( mat_IDX( d, el1[i], el2[j] )>dist ){
00908           dist = mat_IDX( d, el1[i], el2[j]);
00909         }
00910      }
00911   }
00912   
00913   //  dprintf("SL-distance = %f\n", dist);
00914 
00915   free(el1);
00916   free(el2);
00917   return dist;
00918 }
00919 
00920 
00922 #define END_NODE 0
00923 #define INTERMEDIATE_NODE 1
00924 #define NULL_NODE 2
00925 int _dgram_get_deepest_recursive( Dendrogram *c, Dendrogram **candidate, int *depth, int curdepth ){
00926   int status_left=NULL_NODE;
00927   int status_right=NULL_NODE;
00928 
00929   if( c->left==NULL && c->right==NULL ){
00930      return END_NODE;
00931   }
00932 
00933   if( c->left!=NULL ){
00934      status_left  = _dgram_get_deepest_recursive( c->left,  candidate, depth, curdepth+1 );  
00935   }
00936   if( c->right!=NULL ){
00937      status_right = _dgram_get_deepest_recursive( c->right, candidate, depth, curdepth+1 );  
00938   }
00939 
00940   if( status_left==END_NODE && status_right==END_NODE ){
00941 
00942      if( curdepth>=*depth ){
00943         *candidate = c;
00944         *depth = curdepth;
00945      }
00946      dprintf("END_NODE visited, cd=%i, d=%i, c=%p, candidate=%p\n", curdepth, *depth, c, candidate);
00947   }
00948 
00949   return INTERMEDIATE_NODE;
00950 }
00956 Dendrogram* dgram_get_deepest( Dendrogram *c ){
00957   Dendrogram **d, *tmp;
00958   int depth;
00959   d = (Dendrogram **) malloc( sizeof( Dendrogram* ) );
00960 
00961   depth = 0;
00962   _dgram_get_deepest_recursive( c, d, &depth, 0 );
00963   dprintf("deepest=%p, depth=%i\n", *d, depth);
00964   tmp = *d;
00965   free(d);
00966   return tmp;
00967 }
00968 
00969 
00971 void _dgram_print_recursive( Dendrogram *t, int level ){
00972   int i;
00973 
00974   fprintf( stdout, "'%p(%4i,%.2f,%4i)' - ", t, t->clustnum, t->height, t->val );
00975   if( t->left ){ /* there is a left tree, print it */
00976      _dgram_print_recursive( t->left, level+1 );
00977   }  
00978   /* we need to add spaces */
00979   fprintf( stdout, "\n" );
00980   for( i=0; i<level; i++)
00981      fprintf( stdout, "           " );
00982 
00983   if( t->right ){
00984      _dgram_print_recursive( t->right, level+1 );
00985   }
00986 }
00989 void         dgram_print( Dendrogram *t ){
00990   _dgram_print_recursive( t, 0 );
00991   fprintf( stdout, "\n");
00992 }
00993 
00996 int dgram_num_leaves( const Dendrogram *t ){
00997   int r=0;
00998   if( t==NULL ){
00999      return 0;
01000   } else if( t->val!=-1 ){
01001      r=1;
01002   }
01003   r += dgram_num_leaves( t->left );
01004   r += dgram_num_leaves( t->right );
01005   return r;
01006 }
01007 
01008 void dgram_print_node( Dendrogram *t ){
01009   fprintf( stdout, "t=%p, val=%i, height=%f, left=%p, right=%p\n", t, t->val, t->height, t->left, t->right );
01010 }
01011 #undef END_NODE 
01012 #undef INTERMEDIATE_NODE 
01013 #undef NULL_NODE
01014 
01015 #if 0
01016 
01026 int      eeg_best_num_clusters_gapstat( const EEG *eeg, VectorDistanceFunction distfunction, OptArgList *optargs ){
01027   GapStatistic *gap;
01028   double x;
01029   int max_num_clusters;
01030   int num_ref_dist;
01031   EEG *meaneeg;
01032   double **D;
01033   int bestclust=0;
01034 
01035   max_num_clusters = 10;
01036   num_ref_dist = eeg->ntrials;
01037   if( optarglist_has_key( optargs, "max_num_clusters" ) ){
01038      x = optarglist_scalar_by_key( optargs, "max_num_clusters" );
01039      if( !isnan( x ) ) max_num_clusters=(int)x;
01040   }
01041   if( optarglist_has_key( optargs, "num_ref_dist" ) ){
01042      x = optarglist_scalar_by_key( optargs, "num_ref_dist" );
01043      if( !isnan( x ) ) num_ref_dist=(int)x;
01044   }
01045 
01046   meaneeg = eeg_average_channels( eeg );
01047   eeg_print( stderr, meaneeg, 3 );
01048   D = eeg_distmatrix( meaneeg, distfunction, ALLOC_IN_FCT, optargs );
01049 
01050   gap = gapstat_init( NULL, max_num_clusters, num_ref_dist );
01051   gapstat_calculate( gap, meaneeg->data[0], meaneeg->ntrials, meaneeg->n, 
01052                             distfunction, (const double**)D );
01053 
01054   bestclust = gap->khat;
01055 #ifdef DEBUG
01056   gapstat_print( stderr, gap );
01057 #endif 
01058 
01059   dblpp_free( D, meaneeg->ntrials );
01060   eeg_free( meaneeg );
01061   gapstat_free( gap );
01062 
01063   return bestclust;
01064 }
01065 #endif

Generated on Fri Jun 25 2010 14:10:16 for libeegtools by  doxygen 1.7.0