00001 //---------------------------------------------------------------------- 00002 // File: rand.cpp 00003 // Programmer: Sunil Arya and David Mount 00004 // Description: Routines for random point generation 00005 // Last modified: 08/04/06 (Version 1.1.1) 00006 //---------------------------------------------------------------------- 00007 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 00008 // David Mount. All Rights Reserved. 00009 // 00010 // This software and related documentation is part of the Approximate 00011 // Nearest Neighbor Library (ANN). This software is provided under 00012 // the provisions of the Lesser GNU Public License (LGPL). See the 00013 // file ../ReadMe.txt for further information. 00014 // 00015 // The University of Maryland (U.M.) and the authors make no 00016 // representations about the suitability or fitness of this software for 00017 // any purpose. It is provided "as is" without express or implied 00018 // warranty. 00019 //---------------------------------------------------------------------- 00020 // History: 00021 // Revision 0.1 03/04/98 00022 // Initial release 00023 // Revision 0.2 03/26/98 00024 // Changed random/srandom declarations for SGI's. 00025 // Revision 1.0 04/01/05 00026 // annClusGauss centers distributed over [-1,1] rather than [0,1] 00027 // Added annClusOrthFlats distribution 00028 // Changed procedure names to avoid namespace conflicts 00029 // Added annClusFlats distribution 00030 // Added rand/srand option and fixed annRan0() initialization. 00031 // Revision 1.1.1 08/04/06 00032 // Added planted distribution 00033 //---------------------------------------------------------------------- 00034 00035 #include "rand.h" // random generator declarations 00036 00037 using namespace std; // make std:: accessible 00038 00039 //---------------------------------------------------------------------- 00040 // Globals 00041 //---------------------------------------------------------------------- 00042 int annIdum = 0; // used for random number generation 00043 00044 //------------------------------------------------------------------------ 00045 // annRan0 - (safer) uniform random number generator 00046 // 00047 // The code given here is taken from "Numerical Recipes in C" by 00048 // William Press, Brian Flannery, Saul Teukolsky, and William 00049 // Vetterling. The task of the code is to do an additional randomizing 00050 // shuffle on the system-supplied random number generator to make it 00051 // safer to use. 00052 // 00053 // Returns a uniform deviate between 0.0 and 1.0 using the 00054 // system-supplied routine random() or rand(). Set the global 00055 // annIdum to any negative value to initialise or reinitialise 00056 // the sequence. 00057 //------------------------------------------------------------------------ 00058 00059 double annRan0() 00060 { 00061 const int TAB_SIZE = 97; // table size: any large number 00062 int j; 00063 00064 static double y, v[TAB_SIZE]; 00065 static int iff = 0; 00066 const double RAN_DIVISOR = double(ANN_RAND_MAX + 1UL); 00067 if (RAN_DIVISOR < 0) { 00068 cout << "RAN_DIVISOR " << RAN_DIVISOR << endl; 00069 exit(0); 00070 } 00071 00072 //-------------------------------------------------------------------- 00073 // As a precaution against misuse, we will always initialize on the 00074 // first call, even if "annIdum" is not set negative. Determine 00075 // "maxran", the next integer after the largest representable value 00076 // of type int. We assume this is a factor of 2 smaller than the 00077 // corresponding value of type unsigned int. 00078 //-------------------------------------------------------------------- 00079 00080 if (annIdum < 0 || iff == 0) { // initialize 00081 iff = 1; 00082 ANN_SRAND(annIdum); // (re)seed the generator 00083 annIdum = 1; 00084 00085 for (j = 0; j < TAB_SIZE; j++) // exercise the system routine 00086 ANN_RAND(); // (values intentionally ignored) 00087 00088 for (j = 0; j < TAB_SIZE; j++) // then save TAB_SIZE-1 values 00089 v[j] = ANN_RAND(); 00090 y = ANN_RAND(); // generate starting value 00091 } 00092 00093 //-------------------------------------------------------------------- 00094 // This is where we start if not initializing. Use the previously 00095 // saved random number y to get an index j between 1 and TAB_SIZE-1. 00096 // Then use the corresponding v[j] for both the next j and as the 00097 // output number. 00098 //-------------------------------------------------------------------- 00099 00100 j = int(TAB_SIZE * (y / RAN_DIVISOR)); 00101 y = v[j]; 00102 v[j] = ANN_RAND(); // refill the table entry 00103 return y / RAN_DIVISOR; 00104 } 00105 00106 //------------------------------------------------------------------------ 00107 // annRanInt - generate a random integer from {0,1,...,n-1} 00108 // 00109 // If n == 0, then -1 is returned. 00110 //------------------------------------------------------------------------ 00111 00112 static int annRanInt( 00113 int n) 00114 { 00115 int r = (int) (annRan0()*n); 00116 if (r == n) r--; // (in case annRan0() == 1 or n == 0) 00117 return r; 00118 } 00119 00120 //------------------------------------------------------------------------ 00121 // annRanUnif - generate a random uniform in [lo,hi] 00122 //------------------------------------------------------------------------ 00123 00124 static double annRanUnif( 00125 double lo, 00126 double hi) 00127 { 00128 return annRan0()*(hi-lo) + lo; 00129 } 00130 00131 //------------------------------------------------------------------------ 00132 // annRanGauss - Gaussian random number generator 00133 // Returns a normally distributed deviate with zero mean and unit 00134 // variance, using annRan0() as the source of uniform deviates. 00135 //------------------------------------------------------------------------ 00136 00137 static double annRanGauss() 00138 { 00139 static int iset=0; 00140 static double gset; 00141 00142 if (iset == 0) { // we don't have a deviate handy 00143 double v1, v2; 00144 double r = 2.0; 00145 while (r >= 1.0) { 00146 //------------------------------------------------------------ 00147 // Pick two uniform numbers in the square extending from -1 to 00148 // +1 in each direction, see if they are in the circle of radius 00149 // 1. If not, try again 00150 //------------------------------------------------------------ 00151 v1 = annRanUnif(-1, 1); 00152 v2 = annRanUnif(-1, 1); 00153 r = v1 * v1 + v2 * v2; 00154 } 00155 double fac = sqrt(-2.0 * log(r) / r); 00156 //----------------------------------------------------------------- 00157 // Now make the Box-Muller transformation to get two normal 00158 // deviates. Return one and save the other for next time. 00159 //----------------------------------------------------------------- 00160 gset = v1 * fac; 00161 iset = 1; // set flag 00162 return v2 * fac; 00163 } 00164 else { // we have an extra deviate handy 00165 iset = 0; // so unset the flag 00166 return gset; // and return it 00167 } 00168 } 00169 00170 //------------------------------------------------------------------------ 00171 // annRanLaplace - Laplacian random number generator 00172 // Returns a Laplacian distributed deviate with zero mean and 00173 // unit variance, using annRan0() as the source of uniform deviates. 00174 // 00175 // prob(x) = b/2 * exp(-b * |x|). 00176 // 00177 // b is chosen to be sqrt(2.0) so that the variance of the Laplacian 00178 // distribution [2/(b^2)] becomes 1. 00179 //------------------------------------------------------------------------ 00180 00181 static double annRanLaplace() 00182 { 00183 const double b = 1.4142136; 00184 00185 double laprand = -log(annRan0()) / b; 00186 double sign = annRan0(); 00187 if (sign < 0.5) laprand = -laprand; 00188 return(laprand); 00189 } 00190 00191 //---------------------------------------------------------------------- 00192 // annUniformPts - Generate uniformly distributed points 00193 // A uniform distribution over [-1,1]. 00194 //---------------------------------------------------------------------- 00195 00196 void annUniformPts( // uniform distribution 00197 ANNpointArray pa, // point array (modified) 00198 int n, // number of points 00199 int dim) // dimension 00200 { 00201 for (int i = 0; i < n; i++) { 00202 for (int d = 0; d < dim; d++) { 00203 pa[i][d] = (ANNcoord) (annRanUnif(-1,1)); 00204 } 00205 } 00206 } 00207 00208 //---------------------------------------------------------------------- 00209 // annGaussPts - Generate Gaussian distributed points 00210 // A Gaussian distribution with zero mean and the given standard 00211 // deviation. 00212 //---------------------------------------------------------------------- 00213 00214 void annGaussPts( // Gaussian distribution 00215 ANNpointArray pa, // point array (modified) 00216 int n, // number of points 00217 int dim, // dimension 00218 double std_dev) // standard deviation 00219 { 00220 for (int i = 0; i < n; i++) { 00221 for (int d = 0; d < dim; d++) { 00222 pa[i][d] = (ANNcoord) (annRanGauss() * std_dev); 00223 } 00224 } 00225 } 00226 00227 //---------------------------------------------------------------------- 00228 // annLaplacePts - Generate Laplacian distributed points 00229 // Generates a Laplacian distribution (zero mean and unit variance). 00230 //---------------------------------------------------------------------- 00231 00232 void annLaplacePts( // Laplacian distribution 00233 ANNpointArray pa, // point array (modified) 00234 int n, // number of points 00235 int dim) // dimension 00236 { 00237 for (int i = 0; i < n; i++) { 00238 for (int d = 0; d < dim; d++) { 00239 pa[i][d] = (ANNcoord) annRanLaplace(); 00240 } 00241 } 00242 } 00243 00244 //---------------------------------------------------------------------- 00245 // annCoGaussPts - Generate correlated Gaussian distributed points 00246 // Generates a Gauss-Markov distribution of zero mean and unit 00247 // variance. 00248 //---------------------------------------------------------------------- 00249 00250 void annCoGaussPts( // correlated-Gaussian distribution 00251 ANNpointArray pa, // point array (modified) 00252 int n, // number of points 00253 int dim, // dimension 00254 double correlation) // correlation 00255 { 00256 double std_dev_w = sqrt(1.0 - correlation * correlation); 00257 for (int i = 0; i < n; i++) { 00258 double previous = annRanGauss(); 00259 pa[i][0] = (ANNcoord) previous; 00260 for (int d = 1; d < dim; d++) { 00261 previous = correlation*previous + std_dev_w*annRanGauss(); 00262 pa[i][d] = (ANNcoord) previous; 00263 } 00264 } 00265 } 00266 00267 //---------------------------------------------------------------------- 00268 // annCoLaplacePts - Generate correlated Laplacian distributed points 00269 // Generates a Laplacian-Markov distribution of zero mean and unit 00270 // variance. 00271 //---------------------------------------------------------------------- 00272 00273 void annCoLaplacePts( // correlated-Laplacian distribution 00274 ANNpointArray pa, // point array (modified) 00275 int n, // number of points 00276 int dim, // dimension 00277 double correlation) // correlation 00278 { 00279 double wn; 00280 double corr_sq = correlation * correlation; 00281 00282 for (int i = 0; i < n; i++) { 00283 double previous = annRanLaplace(); 00284 pa[i][0] = (ANNcoord) previous; 00285 for (int d = 1; d < dim; d++) { 00286 double temp = annRan0(); 00287 if (temp < corr_sq) 00288 wn = 0.0; 00289 else 00290 wn = annRanLaplace(); 00291 previous = correlation * previous + wn; 00292 pa[i][d] = (ANNcoord) previous; 00293 } 00294 } 00295 } 00296 00297 //---------------------------------------------------------------------- 00298 // annClusGaussPts - Generate clusters of Gaussian distributed points 00299 // Cluster centers are uniformly distributed over [-1,1], and the 00300 // standard deviation within each cluster is fixed. 00301 // 00302 // Note: Once cluster centers have been set, they are not changed, 00303 // unless new_clust = true. This is so that subsequent calls generate 00304 // points from the same distribution. It follows, of course, that any 00305 // attempt to change the dimension or number of clusters without 00306 // generating new clusters is asking for trouble. 00307 // 00308 // Note: Cluster centers are not generated by a call to uniformPts(). 00309 // Although this could be done, it has been omitted for 00310 // compatibility with annClusGaussPts() in the colored version, 00311 // rand_c.cc. 00312 //---------------------------------------------------------------------- 00313 00314 void annClusGaussPts( // clustered-Gaussian distribution 00315 ANNpointArray pa, // point array (modified) 00316 int n, // number of points 00317 int dim, // dimension 00318 int n_clus, // number of colors 00319 ANNbool new_clust, // generate new clusters. 00320 double std_dev) // standard deviation within clusters 00321 { 00322 static ANNpointArray clusters = NULL;// cluster storage 00323 00324 if (clusters == NULL || new_clust) {// need new cluster centers 00325 if (clusters != NULL) // clusters already exist 00326 annDeallocPts(clusters); // get rid of them 00327 clusters = annAllocPts(n_clus, dim); 00328 // generate cluster center coords 00329 for (int i = 0; i < n_clus; i++) { 00330 for (int d = 0; d < dim; d++) { 00331 clusters[i][d] = (ANNcoord) annRanUnif(-1,1); 00332 } 00333 } 00334 } 00335 00336 for (int i = 0; i < n; i++) { 00337 int c = annRanInt(n_clus); // generate cluster index 00338 for (int d = 0; d < dim; d++) { 00339 pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + clusters[c][d]); 00340 } 00341 } 00342 } 00343 00344 //---------------------------------------------------------------------- 00345 // annClusOrthFlats - points clustered along orthogonal flats 00346 // 00347 // This distribution consists of a collection points clustered 00348 // among a collection of axis-aligned low dimensional flats in 00349 // the hypercube [-1,1]^d. A set of n_clus orthogonal flats are 00350 // generated, each whose dimension is a random number between 1 00351 // and max_dim. The points are evenly distributed among the clusters. 00352 // For each cluster, we generate points uniformly distributed along 00353 // the flat within the hypercube. 00354 // 00355 // This is done as follows. Each cluster is defined by a d-element 00356 // control vector whose components are either: 00357 // 00358 // CO_FLAG indicating that this component is to be generated 00359 // uniformly in [-1,1], 00360 // x a value other than CO_FLAG in the range [-1,1], 00361 // which indicates that this coordinate is to be 00362 // generated as x plus a Gaussian random deviation 00363 // with the given standard deviation. 00364 // 00365 // The number of zero components is the dimension of the flat, which 00366 // is a random integer in the range from 1 to max_dim. The points 00367 // are disributed between clusters in nearly equal sized groups. 00368 // 00369 // Note: Once cluster centers have been set, they are not changed, 00370 // unless new_clust = true. This is so that subsequent calls generate 00371 // points from the same distribution. It follows, of course, that any 00372 // attempt to change the dimension or number of clusters without 00373 // generating new clusters is asking for trouble. 00374 // 00375 // To make this a bad scenario at query time, query points should be 00376 // selected from a different distribution, e.g. uniform or Gaussian. 00377 // 00378 // We use a little programming trick to generate groups of roughly 00379 // equal size. If n is the total number of points, and n_clus is 00380 // the number of clusters, then the c-th cluster (0 <= c < n_clus) 00381 // is given floor((n+c)/n_clus) points. It can be shown that this 00382 // will exactly consume all n points. 00383 // 00384 // This procedure makes use of the utility procedure, genOrthFlat 00385 // which generates points in one orthogonal flat, according to 00386 // the given control vector. 00387 // 00388 //---------------------------------------------------------------------- 00389 const double CO_FLAG = 999; // special flag value 00390 00391 static void genOrthFlat( // generate points on an orthog flat 00392 ANNpointArray pa, // point array 00393 int n, // number of points 00394 int dim, // dimension 00395 double *control, // control vector 00396 double std_dev) // standard deviation 00397 { 00398 for (int i = 0; i < n; i++) { // generate each point 00399 for (int d = 0; d < dim; d++) { // generate each coord 00400 if (control[d] == CO_FLAG) // dimension on flat 00401 pa[i][d] = (ANNcoord) annRanUnif(-1,1); 00402 else // dimension off flat 00403 pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + control[d]); 00404 } 00405 } 00406 } 00407 00408 void annClusOrthFlats( // clustered along orthogonal flats 00409 ANNpointArray pa, // point array (modified) 00410 int n, // number of points 00411 int dim, // dimension 00412 int n_clus, // number of colors 00413 ANNbool new_clust, // generate new clusters. 00414 double std_dev, // standard deviation within clusters 00415 int max_dim) // maximum dimension of the flats 00416 { 00417 static ANNpointArray control = NULL; // control vectors 00418 00419 if (control == NULL || new_clust) { // need new cluster centers 00420 if (control != NULL) { // clusters already exist 00421 annDeallocPts(control); // get rid of them 00422 } 00423 control = annAllocPts(n_clus, dim); 00424 00425 for (int c = 0; c < n_clus; c++) { // generate clusters 00426 int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat 00427 for (int d = 0; d < dim; d++) { // generate side locations 00428 // prob. of picking next dim 00429 double Prob = ((double) n_dim)/((double) (dim-d)); 00430 if (annRan0() < Prob) { // add this one to flat 00431 control[c][d] = CO_FLAG; // flag this entry 00432 n_dim--; // one fewer dim to fill 00433 } 00434 else { // don't take this one 00435 control[c][d] = annRanUnif(-1,1);// random value in [-1,1] 00436 } 00437 } 00438 } 00439 } 00440 int offset = 0; // offset in pa array 00441 for (int c = 0; c < n_clus; c++) { // generate clusters 00442 int pick = (n+c)/n_clus; // number of points to pick 00443 // generate the points 00444 genOrthFlat(pa+offset, pick, dim, control[c], std_dev); 00445 offset += pick; // increment offset 00446 } 00447 } 00448 00449 //---------------------------------------------------------------------- 00450 // annClusEllipsoids - points clustered around axis-aligned ellipsoids 00451 // 00452 // This distribution consists of a collection points clustered 00453 // among a collection of low dimensional ellipsoids whose axes 00454 // are alligned with the coordinate axes in the hypercube [-1,1]^d. 00455 // The objective is to model distributions in which the points are 00456 // distributed in lower dimensional subspaces, and within this 00457 // lower dimensional space the points are distributed with a 00458 // Gaussian distribution (with no correlation between the 00459 // dimensions). 00460 // 00461 // The distribution is given the number of clusters or "colors" 00462 // (n_clus), maximum number of dimensions (max_dim) of the lower 00463 // dimensional subspace, a "small" standard deviation 00464 // (std_dev_small), and a "large" standard deviation range 00465 // (std_dev_lo, std_dev_hi). 00466 // 00467 // The algorithm generates n_clus cluster centers uniformly from 00468 // the hypercube [-1,1]^d. For each cluster, it selects the 00469 // dimension of the subspace as a random number r between 1 and 00470 // max_dim. These are the dimensions of the ellipsoid. Then it 00471 // generates a d-element std dev vector whose entries are the 00472 // standard deviation for the coordinates of each cluster in the 00473 // distribution. Among the d-element control vector, r randomly 00474 // chosen values are chosen uniformly from the range [std_dev_lo, 00475 // std_dev_hi]. The remaining values are set to std_dev_small. 00476 // 00477 // Note that annClusGaussPts is a special case of this in which 00478 // max_dim = 0, and std_dev = std_dev_small. 00479 // 00480 // If the flag new_clust is set, then new cluster centers are 00481 // generated. 00482 // 00483 // This procedure makes use of the utility procedure genGauss 00484 // which generates points distributed according to a Gaussian 00485 // distribution. 00486 // 00487 //---------------------------------------------------------------------- 00488 00489 static void genGauss( // generate points on a general Gaussian 00490 ANNpointArray pa, // point array 00491 int n, // number of points 00492 int dim, // dimension 00493 double *center, // center vector 00494 double *std_dev) // standard deviation vector 00495 { 00496 for (int i = 0; i < n; i++) { 00497 for (int d = 0; d < dim; d++) { 00498 pa[i][d] = (ANNcoord) (std_dev[d]*annRanGauss() + center[d]); 00499 } 00500 } 00501 } 00502 00503 void annClusEllipsoids( // clustered around ellipsoids 00504 ANNpointArray pa, // point array (modified) 00505 int n, // number of points 00506 int dim, // dimension 00507 int n_clus, // number of colors 00508 ANNbool new_clust, // generate new clusters. 00509 double std_dev_small, // small standard deviation 00510 double std_dev_lo, // low standard deviation for ellipses 00511 double std_dev_hi, // high standard deviation for ellipses 00512 int max_dim) // maximum dimension of the flats 00513 { 00514 static ANNpointArray centers = NULL; // cluster centers 00515 static ANNpointArray std_dev = NULL; // standard deviations 00516 00517 if (centers == NULL || new_clust) { // need new cluster centers 00518 if (centers != NULL) // clusters already exist 00519 annDeallocPts(centers); // get rid of them 00520 if (std_dev != NULL) // std deviations already exist 00521 annDeallocPts(std_dev); // get rid of them 00522 00523 centers = annAllocPts(n_clus, dim); // alloc new clusters and devs 00524 std_dev = annAllocPts(n_clus, dim); 00525 00526 for (int i = 0; i < n_clus; i++) { // gen cluster center coords 00527 for (int d = 0; d < dim; d++) { 00528 centers[i][d] = (ANNcoord) annRanUnif(-1,1); 00529 } 00530 } 00531 for (int c = 0; c < n_clus; c++) { // generate cluster std dev 00532 int n_dim = 1 + annRanInt(max_dim); // number of dimensions in flat 00533 for (int d = 0; d < dim; d++) { // generate std dev's 00534 // prob. of picking next dim 00535 double Prob = ((double) n_dim)/((double) (dim-d)); 00536 if (annRan0() < Prob) { // add this one to ellipse 00537 // generate random std dev 00538 std_dev[c][d] = annRanUnif(std_dev_lo, std_dev_hi); 00539 n_dim--; // one fewer dim to fill 00540 } 00541 else { // don't take this one 00542 std_dev[c][d] = std_dev_small;// use small std dev 00543 } 00544 } 00545 } 00546 } 00547 00548 int offset = 0; // next slot to fill 00549 for (int c = 0; c < n_clus; c++) { // generate clusters 00550 int pick = (n+c)/n_clus; // number of points to pick 00551 // generate the points 00552 genGauss(pa+offset, pick, dim, centers[c], std_dev[c]); 00553 offset += pick; // increment offset in array 00554 } 00555 } 00556 00557 //---------------------------------------------------------------------- 00558 // annPlanted - Generates points from a "planted" distribution 00559 // In high dimensional spaces, interpoint distances tend to be 00560 // highly clustered around the mean value. Approximate nearest 00561 // neighbor searching makes little sense in this context, unless it 00562 // is the case that each query point is significantly closer to its 00563 // nearest neighbor than to other points. Thus, the query points 00564 // should be planted close to the data points. Given a source data 00565 // set, this procedure generates a set of query points having this 00566 // property. 00567 // 00568 // We are given a source data array and a standard deviation. We 00569 // generate points as follows. We select a random point from the 00570 // source data set, and we generate a Gaussian point centered about 00571 // this random point and perturbed by a normal distributed random 00572 // variable with mean zero and the given standard deviation along 00573 // each coordinate. 00574 // 00575 // Note that this essentially the same a clustered Gaussian 00576 // distribution, but where the cluster centers are given by the 00577 // source data set. 00578 //---------------------------------------------------------------------- 00579 00580 void annPlanted( // planted nearest neighbors 00581 ANNpointArray pa, // point array (modified) 00582 int n, // number of points 00583 int dim, // dimension 00584 ANNpointArray src, // source point array 00585 int n_src, // source size 00586 double std_dev) // standard deviation about source 00587 { 00588 for (int i = 0; i < n; i++) { 00589 int c = annRanInt(n_src); // generate source index 00590 for (int d = 0; d < dim; d++) { 00591 pa[i][d] = (ANNcoord) (std_dev*annRanGauss() + src[c][d]); 00592 } 00593 } 00594 }