00001 //---------------------------------------------------------------------- 00002 // File: ann_test.cpp 00003 // Programmer: Sunil Arya and David Mount 00004 // Description: test program for ANN (approximate nearest neighbors) 00005 // Last modified: 08/04/06 (Version 1.1.1) 00006 //---------------------------------------------------------------------- 00007 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 00008 // David Mount. All Rights Reserved. 00009 // 00010 // This software and related documentation is part of the Approximate 00011 // Nearest Neighbor Library (ANN). This software is provided under 00012 // the provisions of the Lesser GNU Public License (LGPL). See the 00013 // file ../ReadMe.txt for further information. 00014 // 00015 // The University of Maryland (U.M.) and the authors make no 00016 // representations about the suitability or fitness of this software for 00017 // any purpose. It is provided "as is" without express or implied 00018 // warranty. 00019 //---------------------------------------------------------------------- 00020 // History: 00021 // Revision 0.1 03/04/98 00022 // Initial release 00023 // Revision 0.2 06/26/98 00024 // Added CLOCKS_PER_SEC definition if needed 00025 // Revision 1.0 04/01/05 00026 // Added comments (from "#" to eol) 00027 // Added clus_orth_flats and clus_ellipsoids distributions 00028 // Fixed order of fair and midpt in split_table 00029 // Added dump/load operations 00030 // Cleaned up C++ for modern compilers 00031 // Revision 1.1 05/03/05 00032 // Added fixed radius kNN search 00033 // Revision 1.1.1 08/04/06 00034 // Added planted distribution 00035 //---------------------------------------------------------------------- 00036 00037 #include <ctime> // clock 00038 #include <cmath> // math routines 00039 #include <string> // C string ops 00040 #include <fstream> // file I/O 00041 00042 #include <ANN/ANN.h> // ANN declarations 00043 #include <ANN/ANNx.h> // more ANN declarations 00044 #include <ANN/ANNperf.h> // performance evaluation 00045 00046 #include "rand.h" // random point generation 00047 00048 #ifndef CLOCKS_PER_SEC // define clocks-per-second if needed 00049 #define CLOCKS_PER_SEC 1000000 00050 #endif 00051 00052 using namespace std; // make std:: available 00053 00054 //---------------------------------------------------------------------- 00055 // ann_test 00056 // 00057 // This program is a driver for testing and evaluating the ANN library 00058 // for computing approximate nearest neighbors. It allows the user to 00059 // generate data and query sets of various sizes, dimensions, and 00060 // distributions, to build kd- and bbd-trees of various types, and then 00061 // run queries and outputting various performance statistics. 00062 // 00063 // Overview: 00064 // --------- 00065 // The test program is run as follows: 00066 // 00067 // ann_test < test_input > test_output 00068 // 00069 // where the test_input file contains a list of directives as described 00070 // below. Directives consist of a directive name, followed by list of 00071 // arguments (depending on the directive). Arguments and directives are 00072 // separated by white space (blank, tab, and newline). String arguments 00073 // are not quoted, and consist of a string of nonwhite chacters. A 00074 // character "#" denotes a comment. The following characters up to 00075 // the end of line are ignored. Comments may only be inserted between 00076 // directives (not within the argument list of a directive). 00077 // 00078 // Basic operations: 00079 // ----------------- 00080 // The test program can perform the following operations. How these 00081 // operations are performed depends on the options which are described 00082 // later. 00083 // 00084 // Data Generation: 00085 // ---------------- 00086 // read_data_pts <file> Create a set of data points whose 00087 // coordinates are input from file <file>. 00088 // gen_data_pts Create a set of data points whose 00089 // coordinates are generated from the 00090 // current point distribution. 00091 // 00092 // Building the tree: 00093 // ------------------ 00094 // build_ann Generate an approximate nearest neighbor 00095 // structure for the current data set, using 00096 // the selected splitting rules. Any existing 00097 // tree will be destroyed. 00098 // 00099 // Query Generation/Searching: 00100 // --------------------------- 00101 // read_query_pts <file> Create a set of query points whose 00102 // coordinates are input from file <file>. 00103 // gen_query_pts Create a set of query points whose 00104 // coordinates are generated from the 00105 // current point distribution. 00106 // run_queries <string> Apply nearest neighbor searching to the 00107 // query points using the approximate nearest 00108 // neighbor structure and the given search 00109 // strategy. Possible strategies are: 00110 // standard = standard kd-tree search 00111 // priority = priority search 00112 // 00113 // Miscellaneous: 00114 // -------------- 00115 // output_label Output a label to the output file. 00116 // dump <file> Dump the current structure to given file. 00117 // (The dump format is explained further in 00118 // the source file kd_tree.cc.) 00119 // load <file> Load a tree from a data file which was 00120 // created by the dump operation. Any 00121 // existing tree will be destroyed. 00122 // 00123 // Options: 00124 // -------- 00125 // How these operations are performed depends on a set of options. 00126 // If an option is not specified, a default value is used. An option 00127 // retains its value until it is set again. String inputs are not 00128 // enclosed in quotes, and must contain no embedded white space (sorry, 00129 // this is C++'s convention). 00130 // 00131 // Options affecting search tree structure: 00132 // ---------------------------------------- 00133 // split_rule <type> Type of splitting rule to use in building 00134 // the search tree. Choices are: 00135 // kd = optimized kd-tree 00136 // midpt = midpoint split 00137 // fair = fair split 00138 // sl_midpt = sliding midpt split 00139 // sl_fair = sliding fair split 00140 // suggest = authors' choice for best 00141 // The default is "suggest". See the file 00142 // kd_split.cc for more detailed information. 00143 // 00144 // shrink_rule <type> Type of shrinking rule to use in building 00145 // a bd-tree data structure. If "none" is 00146 // given, then no shrinking is performed and 00147 // the result is a kd-tree. Choices are: 00148 // none = perform no shrinking 00149 // simple = simple shrinking 00150 // centroid = centroid shrinking 00151 // suggest = authors' choice for best 00152 // The default is "none". See the file 00153 // bd_tree.cc for more information. 00154 // bucket_size <int> Bucket size, that is, the maximum number of 00155 // points stored in each leaf node. 00156 // 00157 // Options affecting data and query point generation: 00158 // -------------------------------------------------- 00159 // dim <int> Dimension of space. 00160 // seed <int> Seed for random number generation. 00161 // data_size <int> Number of data points. When reading data 00162 // points from a file, this indicates the 00163 // maximum number of points for storage 00164 // allocation. Default = 100. 00165 // query_size <int> Same as data_size for query points. 00166 // std_dev <float> Standard deviation (used in gauss, 00167 // planted, and clustered distributions). 00168 // This is the "small" distribution for 00169 // clus_ellipsoids. Default = 1. 00170 // std_dev_lo <float> Low and high standard deviations (used in 00171 // std_dev_hi <float> clus_ellipsoids). Default = 1. 00172 // corr_coef <float> Correlation coefficient (used in co-gauss 00173 // and co_lapace distributions). Default = 0.05. 00174 // colors <int> Number of color classes (clusters) (used 00175 // in the clustered distributions). Default = 5. 00176 // new_clust Once generated, cluster centers are not 00177 // normally regenerated. This is so that both 00178 // query points and data points can be generated 00179 // using the same set of clusters. This option 00180 // forces new cluster centers to be generated 00181 // with the next generation of either data or 00182 // query points. 00183 // max_clus_dim <int> Maximum dimension of clusters (used in 00184 // clus_orth_flats and clus_ellipsoids). 00185 // Default = 1. 00186 // distribution <string> Type of input distribution 00187 // uniform = uniform over cube [-1,1]^d. 00188 // gauss = Gaussian with mean 0 00189 // laplace = Laplacian, mean 0 and var 1 00190 // co_gauss = correlated Gaussian 00191 // co_laplace = correlated Laplacian 00192 // clus_gauss = clustered Gaussian 00193 // clus_orth_flats = clusters of orth flats 00194 // clus_ellipsoids = clusters of ellipsoids 00195 // planted = planted distribution 00196 // See the file rand.cpp for further information. 00197 // 00198 // Options affecting nearest neighbor search: 00199 // ------------------------------------------ 00200 // epsilon <float> Error bound for approx. near neigh. search. 00201 // near_neigh <int> Number of nearest neighbors to compute. 00202 // max_pts_visit <int> Maximum number of points to visit before 00203 // terminating. (Used in applications where 00204 // real-time performance is important.) 00205 // (Default = 0, which means no limit.) 00206 // radius_bound <float> Sets an upper bound on the nearest 00207 // neighbor search radius. If the bound is 00208 // positive, then fixed-radius nearest 00209 // neighbor searching is performed, and the 00210 // count of the number of points in the 00211 // range is returned. If the bound is 00212 // zero, then standard search is used. 00213 // This can only be used with standard, not 00214 // priority, search. (Default = 0, which 00215 // means standard search.) 00216 // 00217 // Options affection general program behavior: 00218 // ------------------------------------------- 00219 // stats <string> Level of statistics output 00220 // silent = no output, 00221 // exec_time += execution time only 00222 // prep_stats += preprocessing statistics 00223 // query_stats += query performance stats 00224 // query_res += results of queries 00225 // show_pts += show the data points 00226 // show_struct += print search structure 00227 // validate <string> Validate experiment and compute average 00228 // error. Since validation causes exact 00229 // nearest neighbors to be computed by the 00230 // brute force method, this can take a long 00231 // time. Valid arguments are: 00232 // on = turn validation on 00233 // off = turn validation off 00234 // true_near_neigh <int> Number of true nearest neighbors to compute. 00235 // When validating, we compute the difference 00236 // in rank between each reported nearest neighbor 00237 // and the true nearest neighbor of the same 00238 // rank. Thus it is necessary to compute a 00239 // few more true nearest neighbors. By default 00240 // we compute 10 more than near_neigh. With 00241 // this option the exact number can be set. 00242 // (Used only when validating.) 00243 // 00244 // Example: 00245 // -------- 00246 // output_label test_run_0 # output label for this run 00247 // validate off # do not perform validation 00248 // dim 16 # points in dimension 16 00249 // stats query_stats # output performance statistics for queries 00250 // seed 121212 # random number seed 00251 // data_size 1000 00252 // distribution uniform 00253 // gen_data_pts # 1000 uniform data points in dim 16 00254 // query_size 100 00255 // std_dev 0.05 00256 // distribution clus_gauss 00257 // gen_query_pts # 100 points in 10 clusters with std_dev 0.05 00258 // bucket_size 2 00259 // split_rule kd 00260 // shrink_rule none 00261 // build_ann # kd-tree, bucket size 2 00262 // epsilon 0.1 00263 // near_neigh 5 00264 // max_pts_visit 100 # stop search if more than 100 points seen 00265 // run_queries standard # run queries; 5 nearest neighbors, 10% error 00266 // data_size 500 00267 // read_data_pts data.in # read up to 500 points from file data.in 00268 // split_rule sl_midpt 00269 // shrink_rule simple 00270 // build_ann # bd-tree; simple shrink, sliding midpoint split 00271 // epsilon 0 00272 // run_queries priority # run same queries; 0 allowable error 00273 // 00274 //------------------------------------------------------------------------ 00275 00276 //------------------------------------------------------------------------ 00277 // Constants 00278 //------------------------------------------------------------------------ 00279 00280 const int STRING_LEN = 500; // max string length 00281 const double ERR = 0.00001; // epsilon (for float compares) 00282 00283 //------------------------------------------------------------------------ 00284 // Enumerated values and conversions 00285 //------------------------------------------------------------------------ 00286 00287 typedef enum {DATA, QUERY} PtType; // point types 00288 00289 //------------------------------------------------------------------------ 00290 // Statistics output levels 00291 //------------------------------------------------------------------------ 00292 00293 typedef enum { // stat levels 00294 SILENT, // no output 00295 EXEC_TIME, // just execution time 00296 PREP_STATS, // preprocessing info 00297 QUERY_STATS, // query performance 00298 QUERY_RES, // query results 00299 SHOW_PTS, // show data points 00300 SHOW_STRUCT, // show tree structure 00301 N_STAT_LEVELS} // number of levels 00302 StatLev; 00303 00304 const char stat_table[N_STAT_LEVELS][STRING_LEN] = { 00305 "silent", // SILENT 00306 "exec_time", // EXEC_TIME 00307 "prep_stats", // PREP_STATS 00308 "query_stats", // QUERY_STATS 00309 "query_res", // QUERY_RES 00310 "show_pts", // SHOW_PTS 00311 "show_struct"}; // SHOW_STRUCT 00312 00313 //------------------------------------------------------------------------ 00314 // Distributions 00315 //------------------------------------------------------------------------ 00316 00317 typedef enum { // distributions 00318 UNIFORM, // uniform over cube [-1,1]^d. 00319 GAUSS, // Gaussian with mean 0 00320 LAPLACE, // Laplacian, mean 0 and var 1 00321 CO_GAUSS, // correlated Gaussian 00322 CO_LAPLACE, // correlated Laplacian 00323 CLUS_GAUSS, // clustered Gaussian 00324 CLUS_ORTH_FLATS, // clustered on orthog flats 00325 CLUS_ELLIPSOIDS, // clustered on ellipsoids 00326 PLANTED, // planted distribution 00327 N_DISTRIBS} 00328 Distrib; 00329 00330 const char distr_table[N_DISTRIBS][STRING_LEN] = { 00331 "uniform", // UNIFORM 00332 "gauss", // GAUSS 00333 "laplace", // LAPLACE 00334 "co_gauss", // CO_GAUSS 00335 "co_laplace", // CO_LAPLACE 00336 "clus_gauss", // CLUS_GAUSS 00337 "clus_orth_flats", // CLUS_ORTH_FLATS 00338 "clus_ellipsoids", // CLUS_ELLIPSOIS 00339 "planted"}; // PLANTED 00340 00341 //------------------------------------------------------------------------ 00342 // Splitting rules for kd-trees (see ANN.h for types) 00343 //------------------------------------------------------------------------ 00344 00345 const int N_SPLIT_RULES = 6; 00346 const char split_table[N_SPLIT_RULES][STRING_LEN] = { 00347 "standard", // standard optimized kd-tree 00348 "midpt", // midpoint split 00349 "fair", // fair split 00350 "sl_midpt", // sliding midpt split 00351 "sl_fair", // sliding fair split 00352 "suggest"}; // authors' choice for best 00353 00354 //------------------------------------------------------------------------ 00355 // Shrinking rules for bd-trees (see ANN.h for types) 00356 //------------------------------------------------------------------------ 00357 00358 const int N_SHRINK_RULES = 4; 00359 const char shrink_table[N_SHRINK_RULES][STRING_LEN] = { 00360 "none", // perform no shrinking (kd-tree) 00361 "simple", // simple shrinking 00362 "centroid", // centroid shrinking 00363 "suggest"}; // authors' choice for best 00364 00365 //---------------------------------------------------------------------- 00366 // Short utility functions 00367 // Error - general error routine 00368 // printPoint - print a point to standard output 00369 // lookUp - look up a name in table and return index 00370 //---------------------------------------------------------------------- 00371 00372 void Error( // error routine 00373 char *msg, // error message 00374 ANNerr level) // abort afterwards 00375 { 00376 if (level == ANNabort) { 00377 cerr << "ann_test: ERROR------->" << msg << "<-------------ERROR\n"; 00378 exit(1); 00379 } 00380 else { 00381 cerr << "ann_test: WARNING----->" << msg << "<-------------WARNING\n"; 00382 } 00383 } 00384 00385 void printPoint( // print point 00386 ANNpoint p, // the point 00387 int dim) // the dimension 00388 { 00389 cout << "["; 00390 for (int i = 0; i < dim; i++) { 00391 cout << p[i]; 00392 if (i < dim-1) cout << ","; 00393 } 00394 cout << "]"; 00395 } 00396 00397 int lookUp( // look up name in table 00398 const char *arg, // name to look up 00399 const char (*table)[STRING_LEN], // name table 00400 int size) // table size 00401 { 00402 int i; 00403 for (i = 0; i < size; i++) { 00404 if (!strcmp(arg, table[i])) return i; 00405 } 00406 return i; 00407 } 00408 00409 //------------------------------------------------------------------------ 00410 // Function declarations 00411 //------------------------------------------------------------------------ 00412 00413 void generatePts( // generate data/query points 00414 ANNpointArray &pa, // point array (returned) 00415 int n, // number of points 00416 PtType type, // point type 00417 ANNbool new_clust, // new cluster centers desired? 00418 ANNpointArray src = NULL, // source array (for PLANTED) 00419 int n_src = 0); // source size (for PLANTED) 00420 00421 void readPts( // read data/query points from file 00422 ANNpointArray &pa, // point array (returned) 00423 int &n, // number of points 00424 char *file_nm, // file name 00425 PtType type); // point type (DATA, QUERY) 00426 00427 void doValidation(); // perform validation 00428 void getTrueNN(); // compute true nearest neighbors 00429 00430 void treeStats( // print statistics on kd- or bd-tree 00431 ostream &out, // output stream 00432 ANNbool verbose); // print stats 00433 00434 //------------------------------------------------------------------------ 00435 // Default execution parameters 00436 //------------------------------------------------------------------------ 00437 const int extra_nn = 10; // how many extra true nn's? 00438 00439 const int def_dim = 2; // def dimension 00440 const int def_data_size = 100; // def data size 00441 const int def_query_size = 100; // def number of queries 00442 const int def_n_color = 5; // def number of colors 00443 const ANNbool def_new_clust = ANNfalse; // def new clusters flag 00444 const int def_max_dim = 1; // def max flat dimension 00445 const Distrib def_distr = UNIFORM; // def distribution 00446 const double def_std_dev = 1.00; // def standard deviation 00447 const double def_corr_coef = 0.05; // def correlation coef 00448 const int def_bucket_size = 1; // def bucket size 00449 const double def_epsilon = 0.0; // def error bound 00450 const int def_near_neigh = 1; // def number of near neighbors 00451 const int def_max_visit = 0; // def number of points visited 00452 const int def_rad_bound = 0; // def radius bound 00453 // def number of true nn's 00454 const int def_true_nn = def_near_neigh + extra_nn; 00455 const int def_seed = 0; // def seed for random numbers 00456 const ANNbool def_validate = ANNfalse; // def validation flag 00457 // def statistics output level 00458 const StatLev def_stats = QUERY_STATS; 00459 const ANNsplitRule // def splitting rule 00460 def_split = ANN_KD_SUGGEST; 00461 const ANNshrinkRule // def shrinking rule 00462 def_shrink = ANN_BD_NONE; 00463 00464 //------------------------------------------------------------------------ 00465 // Global variables - Execution options 00466 //------------------------------------------------------------------------ 00467 00468 int dim; // dimension 00469 int data_size; // data size 00470 int query_size; // number of queries 00471 int n_color; // number of colors 00472 ANNbool new_clust; // generate new clusters? 00473 int max_dim; // maximum flat dimension 00474 Distrib distr; // distribution 00475 double corr_coef; // correlation coef 00476 double std_dev; // standard deviation 00477 double std_dev_lo; // low standard deviation 00478 double std_dev_hi; // high standard deviation 00479 int bucket_size; // bucket size 00480 double epsilon; // error bound 00481 int near_neigh; // number of near neighbors 00482 int max_pts_visit; // max number of points to visit 00483 double radius_bound; // maximum radius search bound 00484 int true_nn; // number of true nn's 00485 ANNbool validate; // validation flag 00486 StatLev stats; // statistics output level 00487 ANNsplitRule split; // splitting rule 00488 ANNshrinkRule shrink; // shrinking rule 00489 00490 //------------------------------------------------------------------------ 00491 // More globals - pointers to dynamically allocated arrays and structures 00492 // 00493 // It is assumed that all these values are set to NULL when nothing 00494 // is allocated. 00495 // 00496 // data_pts, query_pts The data and query points 00497 // the_tree Points to the kd- or bd-tree for 00498 // nearest neighbor searching. 00499 // apx_nn_idx, apx_dists Record approximate near neighbor 00500 // indices and distances 00501 // apx_pts_in_range Counts of the number of points in 00502 // the in approx range, for fixed- 00503 // radius NN searching. 00504 // true_nn_idx, true_dists Record true near neighbor 00505 // indices and distances 00506 // min_pts_in_range, max_... Min and max counts of the number 00507 // of points in the in approximate 00508 // range. 00509 // valid_dirty To avoid repeated validation, 00510 // we only validate query results 00511 // once. This validation becomes 00512 // invalid, if a new tree, new data 00513 // points or new query points have 00514 // been generated. 00515 // tree_data_size The number of points in the 00516 // current tree. (This will be the 00517 // same a data_size unless points have 00518 // been added since the tree was 00519 // built.) 00520 // 00521 // The approximate and true nearest neighbor results are stored 00522 // in: apx_nn_idx, apx_dists, and true_nn_idx, true_dists. 00523 // They are really flattened 2-dimensional arrays. Each of these 00524 // arrays consists of query_size blocks, each of which contains 00525 // near_neigh (or true_nn) entries, one for each of the nearest 00526 // neighbors for a given query point. 00527 //------------------------------------------------------------------------ 00528 00529 ANNpointArray data_pts; // data points 00530 ANNpointArray query_pts; // query points 00531 ANNbd_tree* the_tree; // kd- or bd-tree search structure 00532 ANNidxArray apx_nn_idx; // storage for near neighbor indices 00533 ANNdistArray apx_dists; // storage for near neighbor distances 00534 int* apx_pts_in_range; // storage for no. of points in range 00535 ANNidxArray true_nn_idx; // true near neighbor indices 00536 ANNdistArray true_dists; // true near neighbor distances 00537 int* min_pts_in_range; // min points in approx range 00538 int* max_pts_in_range; // max points in approx range 00539 00540 ANNbool valid_dirty; // validation is no longer valid 00541 00542 //------------------------------------------------------------------------ 00543 // Initialize global parameters 00544 //------------------------------------------------------------------------ 00545 00546 void initGlobals() 00547 { 00548 dim = def_dim; // init execution parameters 00549 data_size = def_data_size; 00550 query_size = def_query_size; 00551 distr = def_distr; 00552 corr_coef = def_corr_coef; 00553 std_dev = def_std_dev; 00554 std_dev_lo = def_std_dev; 00555 std_dev_hi = def_std_dev; 00556 new_clust = def_new_clust; 00557 max_dim = def_max_dim; 00558 n_color = def_n_color; 00559 bucket_size = def_bucket_size; 00560 epsilon = def_epsilon; 00561 near_neigh = def_near_neigh; 00562 max_pts_visit = def_max_visit; 00563 radius_bound = def_rad_bound; 00564 true_nn = def_true_nn; 00565 validate = def_validate; 00566 stats = def_stats; 00567 split = def_split; 00568 shrink = def_shrink; 00569 annIdum = -def_seed; // init. global seed for ran0() 00570 00571 data_pts = NULL; // initialize storage pointers 00572 query_pts = NULL; 00573 the_tree = NULL; 00574 apx_nn_idx = NULL; 00575 apx_dists = NULL; 00576 apx_pts_in_range = NULL; 00577 true_nn_idx = NULL; 00578 true_dists = NULL; 00579 min_pts_in_range = NULL; 00580 max_pts_in_range = NULL; 00581 00582 valid_dirty = ANNtrue; // (validation must be done) 00583 } 00584 00585 //------------------------------------------------------------------------ 00586 // getDirective - skip comments and read next directive 00587 // Returns ANNtrue if directive read, and ANNfalse if eof seen. 00588 //------------------------------------------------------------------------ 00589 00590 ANNbool skipComment( // skip any comments 00591 istream &in) // input stream 00592 { 00593 char ch = 0; 00594 // skip whitespace 00595 do { in.get(ch); } while (isspace(ch) && !in.eof()); 00596 while (ch == '#' && !in.eof()) { // comment? 00597 // skip to end of line 00598 do { in.get(ch); } while(ch != '\n' && !in.eof()); 00599 // skip whitespace 00600 do { in.get(ch); } while(isspace(ch) && !in.eof()); 00601 } 00602 if (in.eof()) return ANNfalse; // end of file 00603 in.putback(ch); // put character back 00604 return ANNtrue; 00605 } 00606 00607 ANNbool getDirective( 00608 istream &in, // input stream 00609 char *directive) // directive storage 00610 { 00611 if (!skipComment(in)) // skip comments 00612 return ANNfalse; // found eof along the way? 00613 in >> directive; // read directive 00614 return ANNtrue; 00615 } 00616 00617 00618 //------------------------------------------------------------------------ 00619 // main program - driver 00620 // The main program reads input options, invokes the necessary 00621 // routines to process them. 00622 //------------------------------------------------------------------------ 00623 00624 int main(int argc, char** argv) 00625 { 00626 long clock0; // clock time 00627 char directive[STRING_LEN]; // input directive 00628 char arg[STRING_LEN]; // all-purpose argument 00629 00630 cout << "------------------------------------------------------------\n" 00631 << "ann_test: Version " << ANNversion << " " << ANNversionCmt << "\n" 00632 << " Copyright: " << ANNcopyright << ".\n" 00633 << " Latest Revision: " << ANNlatestRev << ".\n" 00634 << "------------------------------------------------------------\n\n"; 00635 00636 initGlobals(); // initialize global values 00637 00638 //-------------------------------------------------------------------- 00639 // Main input loop 00640 //-------------------------------------------------------------------- 00641 // read input directive 00642 while (getDirective(cin, directive)) { 00643 //---------------------------------------------------------------- 00644 // Read options 00645 //---------------------------------------------------------------- 00646 if (!strcmp(directive,"dim")) { 00647 cin >> dim; 00648 } 00649 else if (!strcmp(directive,"colors")) { 00650 cin >> n_color; 00651 } 00652 else if (!strcmp(directive,"new_clust")) { 00653 new_clust = ANNtrue; 00654 } 00655 else if (!strcmp(directive,"max_clus_dim")) { 00656 cin >> max_dim; 00657 } 00658 else if (!strcmp(directive,"std_dev")) { 00659 cin >> std_dev; 00660 } 00661 else if (!strcmp(directive,"std_dev_lo")) { 00662 cin >> std_dev_lo; 00663 } 00664 else if (!strcmp(directive,"std_dev_hi")) { 00665 cin >> std_dev_hi; 00666 } 00667 else if (!strcmp(directive,"corr_coef")) { 00668 cin >> corr_coef; 00669 } 00670 else if (!strcmp(directive, "data_size")) { 00671 cin >> data_size; 00672 } 00673 else if (!strcmp(directive,"query_size")) { 00674 cin >> query_size; 00675 } 00676 else if (!strcmp(directive,"bucket_size")) { 00677 cin >> bucket_size; 00678 } 00679 else if (!strcmp(directive,"epsilon")) { 00680 cin >> epsilon; 00681 } 00682 else if (!strcmp(directive,"max_pts_visit")) { 00683 cin >> max_pts_visit; 00684 valid_dirty = ANNtrue; // validation must be redone 00685 } 00686 else if (!strcmp(directive,"radius_bound")) { 00687 cin >> radius_bound; 00688 valid_dirty = ANNtrue; // validation must be redone 00689 } 00690 else if (!strcmp(directive,"near_neigh")) { 00691 cin >> near_neigh; 00692 true_nn = near_neigh + extra_nn; // also reset true near neighs 00693 valid_dirty = ANNtrue; // validation must be redone 00694 } 00695 else if (!strcmp(directive,"true_near_neigh")) { 00696 cin >> true_nn; 00697 valid_dirty = ANNtrue; // validation must be redone 00698 } 00699 //---------------------------------------------------------------- 00700 // seed option 00701 // The seed is reset by setting the global annIdum to the 00702 // negation of the seed value. See rand.cpp. 00703 //---------------------------------------------------------------- 00704 else if (!strcmp(directive,"seed")) { 00705 cin >> annIdum; 00706 annIdum = -annIdum; 00707 } 00708 //---------------------------------------------------------------- 00709 // validate option 00710 //---------------------------------------------------------------- 00711 else if (!strcmp(directive,"validate")) { 00712 cin >> arg; // input argument 00713 if (!strcmp(arg, "on")) { 00714 validate = ANNtrue; 00715 cout << "validate = on " 00716 << "(Warning: this may slow execution time.)\n"; 00717 } 00718 else if (!strcmp(arg, "off")) { 00719 validate = ANNfalse; 00720 } 00721 else { 00722 cerr << "Argument: " << arg << "\n"; 00723 Error("validate argument must be \"on\" or \"off\"", ANNabort); 00724 } 00725 } 00726 //---------------------------------------------------------------- 00727 // distribution option 00728 //---------------------------------------------------------------- 00729 else if (!strcmp(directive,"distribution")) { 00730 cin >> arg; // input name and translate 00731 distr = (Distrib) lookUp(arg, distr_table, N_DISTRIBS); 00732 if (distr >= N_DISTRIBS) { // not something we recognize 00733 cerr << "Distribution: " << arg << "\n"; 00734 Error("Unknown distribution", ANNabort); 00735 } 00736 } 00737 //---------------------------------------------------------------- 00738 // stats option 00739 //---------------------------------------------------------------- 00740 else if (!strcmp(directive,"stats")) { 00741 cin >> arg; // input name and translate 00742 stats = (StatLev) lookUp(arg, stat_table, N_STAT_LEVELS); 00743 if (stats >= N_STAT_LEVELS) { // not something we recognize 00744 cerr << "Stats level: " << arg << "\n"; 00745 Error("Unknown statistics level", ANNabort); 00746 } 00747 if (stats > SILENT) 00748 cout << "stats = " << arg << "\n"; 00749 } 00750 //---------------------------------------------------------------- 00751 // split_rule option 00752 //---------------------------------------------------------------- 00753 else if (!strcmp(directive,"split_rule")) { 00754 cin >> arg; // input split_rule name 00755 split = (ANNsplitRule) lookUp(arg, split_table, N_SPLIT_RULES); 00756 if (split >= N_SPLIT_RULES) { // not something we recognize 00757 cerr << "Splitting rule: " << arg << "\n"; 00758 Error("Unknown splitting rule", ANNabort); 00759 } 00760 } 00761 //---------------------------------------------------------------- 00762 // shrink_rule option 00763 //---------------------------------------------------------------- 00764 else if (!strcmp(directive,"shrink_rule")) { 00765 cin >> arg; // input split_rule name 00766 shrink = (ANNshrinkRule) lookUp(arg, shrink_table, N_SHRINK_RULES); 00767 if (shrink >= N_SHRINK_RULES) { // not something we recognize 00768 cerr << "Shrinking rule: " << arg << "\n"; 00769 Error("Unknown shrinking rule", ANNabort); 00770 } 00771 } 00772 //---------------------------------------------------------------- 00773 // label operation 00774 //---------------------------------------------------------------- 00775 else if (!strcmp(directive,"output_label")) { 00776 cin >> arg; 00777 if (stats > SILENT) 00778 cout << "<" << arg << ">\n"; 00779 } 00780 //---------------------------------------------------------------- 00781 // gen_data_pts operation 00782 //---------------------------------------------------------------- 00783 else if (!strcmp(directive,"gen_data_pts")) { 00784 if (distr == PLANTED) { // planted distribution 00785 Error("Cannot use planted distribution for data points", ANNabort); 00786 } 00787 generatePts( // generate data points 00788 data_pts, // data points 00789 data_size, // data size 00790 DATA, // data points 00791 new_clust); // new clusters flag 00792 valid_dirty = ANNtrue; // validation must be redone 00793 new_clust = ANNfalse; // reset flag 00794 } 00795 //---------------------------------------------------------------- 00796 // gen_query_pts operation 00797 // If the distribution is PLANTED, then the query points 00798 // are planted near the data points (which must already be 00799 // generated). 00800 //---------------------------------------------------------------- 00801 else if (!strcmp(directive,"gen_query_pts")) { 00802 if (distr == PLANTED) { // planted distribution 00803 if (data_pts == NULL) { 00804 Error("Must generate data points before query points for planted distribution", ANNabort); 00805 } 00806 generatePts( // generate query points 00807 query_pts, // point array 00808 query_size, // number of query points 00809 QUERY, // query points 00810 new_clust, // new clusters flag 00811 data_pts, // plant around data pts 00812 data_size); 00813 } 00814 else { // all other distributions 00815 generatePts( // generate query points 00816 query_pts, // point array 00817 query_size, // number of query points 00818 QUERY, // query points 00819 new_clust); // new clusters flag 00820 } 00821 valid_dirty = ANNtrue; // validation must be redone 00822 new_clust = ANNfalse; // reset flag 00823 } 00824 //---------------------------------------------------------------- 00825 // read_data_pts operation 00826 //---------------------------------------------------------------- 00827 else if (!strcmp(directive,"read_data_pts")) { 00828 cin >> arg; // input file name 00829 readPts( 00830 data_pts, // point array 00831 data_size, // number of points 00832 arg, // file name 00833 DATA); // data points 00834 valid_dirty = ANNtrue; // validation must be redone 00835 } 00836 //---------------------------------------------------------------- 00837 // read_query_pts operation 00838 //---------------------------------------------------------------- 00839 else if (!strcmp(directive,"read_query_pts")) { 00840 cin >> arg; // input file name 00841 readPts( 00842 query_pts, // point array 00843 query_size, // number of points 00844 arg, // file name 00845 QUERY); // query points 00846 valid_dirty = ANNtrue; // validation must be redone 00847 } 00848 //---------------------------------------------------------------- 00849 // build_ann operation 00850 // We always invoke the constructor for bd-trees. Note 00851 // that when the shrinking rule is NONE (which is true by 00852 // default), then this constructs a kd-tree. 00853 //---------------------------------------------------------------- 00854 else if (!strcmp(directive,"build_ann")) { 00855 //------------------------------------------------------------ 00856 // Build the tree 00857 //------------------------------------------------------------ 00858 if (the_tree != NULL) { // tree exists already 00859 delete the_tree; // get rid of it 00860 } 00861 clock0 = clock(); // start time 00862 00863 the_tree = new ANNbd_tree( // build it 00864 data_pts, // the data points 00865 data_size, // number of points 00866 dim, // dimension of space 00867 bucket_size, // maximum bucket size 00868 split, // splitting rule 00869 shrink); // shrinking rule 00870 00871 //------------------------------------------------------------ 00872 // Print summary 00873 //------------------------------------------------------------ 00874 long prep_time = clock() - clock0; // end of prep time 00875 00876 if (stats > SILENT) { 00877 cout << "[Build ann-structure:\n"; 00878 cout << " split_rule = " << split_table[split] << "\n"; 00879 cout << " shrink_rule = " << shrink_table[shrink] << "\n"; 00880 cout << " data_size = " << data_size << "\n"; 00881 cout << " dim = " << dim << "\n"; 00882 cout << " bucket_size = " << bucket_size << "\n"; 00883 00884 if (stats >= EXEC_TIME) { // output processing time 00885 cout << " process_time = " 00886 << double(prep_time)/CLOCKS_PER_SEC << " sec\n"; 00887 } 00888 00889 if (stats >= PREP_STATS) // output or check tree stats 00890 treeStats(cout, ANNtrue); // print tree stats 00891 else 00892 treeStats(cout, ANNfalse); // check stats 00893 00894 if (stats >= SHOW_STRUCT) { // print the whole tree 00895 cout << " (Structure Contents:\n"; 00896 the_tree->Print(ANNfalse, cout); 00897 cout << " )\n"; 00898 } 00899 cout << "]\n"; 00900 } 00901 } 00902 //---------------------------------------------------------------- 00903 // dump operation 00904 //---------------------------------------------------------------- 00905 else if (!strcmp(directive,"dump")) { 00906 cin >> arg; // input file name 00907 if (the_tree == NULL) { // no tree 00908 Error("Cannot dump. No tree has been built yet", ANNwarn); 00909 } 00910 else { // there is a tree 00911 // try to open file 00912 ofstream out_dump_file(arg); 00913 if (!out_dump_file) { 00914 cerr << "File name: " << arg << "\n"; 00915 Error("Cannot open dump file", ANNabort); 00916 } 00917 // dump the tree and points 00918 the_tree->Dump(ANNtrue, out_dump_file); 00919 if (stats > SILENT) { 00920 cout << "(Tree has been dumped to file " << arg << ")\n"; 00921 } 00922 } 00923 } 00924 //---------------------------------------------------------------- 00925 // load operation 00926 // Since this not only loads a tree, but loads a new set 00927 // of data points. 00928 //---------------------------------------------------------------- 00929 else if (!strcmp(directive,"load")) { 00930 cin >> arg; // input file name 00931 if (the_tree != NULL) { // tree exists already 00932 delete the_tree; // get rid of it 00933 } 00934 if (data_pts != NULL) { // data points exist already 00935 delete data_pts; // get rid of them 00936 } 00937 00938 ifstream in_dump_file(arg); // try to open file 00939 if (!in_dump_file) { 00940 cerr << "File name: " << arg << "\n"; 00941 Error("Cannot open file for loading", ANNabort); 00942 } 00943 // build tree by loading 00944 the_tree = new ANNbd_tree(in_dump_file); 00945 00946 dim = the_tree->theDim(); // new dimension 00947 data_size = the_tree->nPoints(); // number of points 00948 data_pts = the_tree->thePoints(); // new points 00949 00950 valid_dirty = ANNtrue; // validation must be redone 00951 00952 if (stats > SILENT) { 00953 cout << "(Tree has been loaded from file " << arg << ")\n"; 00954 } 00955 if (stats >= SHOW_STRUCT) { // print the tree 00956 cout << " (Structure Contents:\n"; 00957 the_tree->Print(ANNfalse, cout); 00958 cout << " )\n"; 00959 } 00960 } 00961 //---------------------------------------------------------------- 00962 // run_queries operation 00963 // This section does all the query processing. It consists 00964 // of the following subsections: 00965 // 00966 // ** input the argument (standard or priority) and output 00967 // the header describing the essential information. 00968 // ** allocate space for the results to be stored. 00969 // ** run the queries by invoking the appropriate search 00970 // procedure on the query points. Print nearest neighbor 00971 // if requested. 00972 // ** print final summaries 00973 // 00974 // The approach for processing multiple nearest neighbors is 00975 // pretty crude. We allocate an array whose size is the 00976 // product of the total number of queries times the number of 00977 // nearest neighbors (k), and then use each k consecutive 00978 // entries to store the results of each query. 00979 //---------------------------------------------------------------- 00980 else if (!strcmp(directive,"run_queries")) { 00981 00982 //------------------------------------------------------------ 00983 // Input arguments and print summary 00984 //------------------------------------------------------------ 00985 enum {STANDARD, PRIORITY} method; 00986 00987 cin >> arg; // input argument 00988 if (!strcmp(arg, "standard")) { 00989 method = STANDARD; 00990 } 00991 else if (!strcmp(arg, "priority")) { 00992 method = PRIORITY; 00993 } 00994 else { 00995 cerr << "Search type: " << arg << "\n"; 00996 Error("Search type must be \"standard\" or \"priority\"", 00997 ANNabort); 00998 } 00999 if (data_pts == NULL || query_pts == NULL) { 01000 Error("Either data set and query set not constructed", ANNabort); 01001 } 01002 if (the_tree == NULL) { 01003 Error("No search tree built.", ANNabort); 01004 } 01005 01006 //------------------------------------------------------------ 01007 // Set up everything 01008 //------------------------------------------------------------ 01009 01010 #ifdef ANN_PERF // performance only 01011 annResetStats(data_size); // reset statistics 01012 #endif 01013 01014 clock0 = clock(); // start time 01015 // deallocate existing storage 01016 if (apx_nn_idx != NULL) delete [] apx_nn_idx; 01017 if (apx_dists != NULL) delete [] apx_dists; 01018 if (apx_pts_in_range != NULL) delete [] apx_pts_in_range; 01019 // allocate apx answer storage 01020 apx_nn_idx = new ANNidx[near_neigh*query_size]; 01021 apx_dists = new ANNdist[near_neigh*query_size]; 01022 apx_pts_in_range = new int[query_size]; 01023 01024 annMaxPtsVisit(max_pts_visit); // set max points to visit 01025 01026 //------------------------------------------------------------ 01027 // Run the queries 01028 //------------------------------------------------------------ 01029 // pointers for current query 01030 ANNidxArray curr_nn_idx = apx_nn_idx; 01031 ANNdistArray curr_dists = apx_dists; 01032 01033 for (int i = 0; i < query_size; i++) { 01034 #ifdef ANN_PERF 01035 annResetCounts(); // reset counters 01036 #endif 01037 apx_pts_in_range[i] = 0; 01038 01039 if (radius_bound == 0) { // no radius bound 01040 if (method == STANDARD) { 01041 the_tree->annkSearch( 01042 query_pts[i], // query point 01043 near_neigh, // number of near neighbors 01044 curr_nn_idx, // nearest neighbors (returned) 01045 curr_dists, // distance (returned) 01046 epsilon); // error bound 01047 } 01048 else if (method == PRIORITY) { 01049 the_tree->annkPriSearch( 01050 query_pts[i], // query point 01051 near_neigh, // number of near neighbors 01052 curr_nn_idx, // nearest neighbors (returned) 01053 curr_dists, // distance (returned) 01054 epsilon); // error bound 01055 } 01056 else { 01057 Error("Internal error - invalid method", ANNabort); 01058 } 01059 } 01060 else { // use radius bound 01061 if (method != STANDARD) { 01062 Error("A nonzero radius bound assumes standard search", 01063 ANNwarn); 01064 } 01065 apx_pts_in_range[i] = the_tree->annkFRSearch( 01066 query_pts[i], // query point 01067 ANN_POW(radius_bound), // squared radius search bound 01068 near_neigh, // number of near neighbors 01069 curr_nn_idx, // nearest neighbors (returned) 01070 curr_dists, // distance (returned) 01071 epsilon); // error bound 01072 } 01073 curr_nn_idx += near_neigh; // increment current pointers 01074 curr_dists += near_neigh; 01075 01076 #ifdef ANN_PERF 01077 annUpdateStats(); // update stats 01078 #endif 01079 } 01080 01081 long query_time = clock() - clock0; // end of query time 01082 01083 if (validate) { // validation requested 01084 if (valid_dirty) getTrueNN(); // get true near neighbors 01085 doValidation(); // validate 01086 } 01087 01088 //------------------------------------------------------------ 01089 // Print summaries 01090 //------------------------------------------------------------ 01091 01092 if (stats > SILENT) { 01093 cout << "[Run Queries:\n"; 01094 cout << " query_size = " << query_size << "\n"; 01095 cout << " dim = " << dim << "\n"; 01096 cout << " search_method = " << arg << "\n"; 01097 cout << " epsilon = " << epsilon << "\n"; 01098 cout << " near_neigh = " << near_neigh << "\n"; 01099 if (max_pts_visit != 0) 01100 cout << " max_pts_visit = " << max_pts_visit << "\n"; 01101 if (radius_bound != 0) 01102 cout << " radius_bound = " << radius_bound << "\n"; 01103 if (validate) 01104 cout << " true_nn = " << true_nn << "\n"; 01105 01106 if (stats >= EXEC_TIME) { // print exec time summary 01107 cout << " query_time = " << 01108 double(query_time)/(query_size*CLOCKS_PER_SEC) 01109 << " sec/query"; 01110 #ifdef ANN_PERF 01111 cout << " (biased by perf measurements)"; 01112 #endif 01113 cout << "\n"; 01114 } 01115 01116 if (stats >= QUERY_STATS) { // output performance stats 01117 #ifdef ANN_PERF 01118 cout.flush(); 01119 annPrintStats(validate); 01120 #else 01121 cout << " (Performance statistics unavailable.)\n"; 01122 #endif 01123 } 01124 01125 if (stats >= QUERY_RES) { // output results 01126 cout << " (Query Results:\n"; 01127 cout << " Pt\tANN\tDist\n"; 01128 curr_nn_idx = apx_nn_idx; // subarray pointers 01129 curr_dists = apx_dists; 01130 // output nearest neighbors 01131 for (int i = 0; i < query_size; i++) { 01132 cout << " " << setw(4) << i; 01133 for (int j = 0; j < near_neigh; j++) { 01134 // exit if no more neighbors 01135 if (curr_nn_idx[j] == ANN_NULL_IDX) { 01136 cout << "\t[no other pts in radius bound]\n"; 01137 break; 01138 } 01139 else { // output point info 01140 cout << "\t" << curr_nn_idx[j] 01141 << "\t" << ANN_ROOT(curr_dists[j]) 01142 << "\n"; 01143 } 01144 } 01145 // output range count 01146 if (radius_bound != 0) { 01147 cout << " pts_in_radius_bound = " 01148 << apx_pts_in_range[i] << "\n"; 01149 } 01150 // increment subarray pointers 01151 curr_nn_idx += near_neigh; 01152 curr_dists += near_neigh; 01153 } 01154 cout << " )\n"; 01155 } 01156 cout << "]\n"; 01157 } 01158 } 01159 //---------------------------------------------------------------- 01160 // Unknown directive 01161 //---------------------------------------------------------------- 01162 else { 01163 cerr << "Directive: " << directive << "\n"; 01164 Error("Unknown directive", ANNabort); 01165 } 01166 } 01167 //-------------------------------------------------------------------- 01168 // End of input loop (deallocate stuff that was allocated) 01169 //-------------------------------------------------------------------- 01170 if (the_tree != NULL) delete the_tree; 01171 if (data_pts != NULL) annDeallocPts(data_pts); 01172 if (query_pts != NULL) annDeallocPts(query_pts); 01173 if (apx_nn_idx != NULL) delete [] apx_nn_idx; 01174 if (apx_dists != NULL) delete [] apx_dists; 01175 if (apx_pts_in_range != NULL) delete [] apx_pts_in_range; 01176 01177 annClose(); // close ANN 01178 01179 return EXIT_SUCCESS; 01180 } 01181 01182 //------------------------------------------------------------------------ 01183 // generatePts - call appropriate routine to generate points of a 01184 // given distribution. 01185 //------------------------------------------------------------------------ 01186 01187 void generatePts( 01188 ANNpointArray &pa, // point array (returned) 01189 int n, // number of points to generate 01190 PtType type, // point type 01191 ANNbool new_clust, // new cluster centers desired? 01192 ANNpointArray src, // source array (if distr=PLANTED) 01193 int n_src) // source size (if distr=PLANTED) 01194 { 01195 if (pa != NULL) annDeallocPts(pa); // get rid of any old points 01196 pa = annAllocPts(n, dim); // allocate point storage 01197 01198 switch (distr) { 01199 case UNIFORM: // uniform over cube [-1,1]^d. 01200 annUniformPts(pa, n, dim); 01201 break; 01202 case GAUSS: // Gaussian with mean 0 01203 annGaussPts(pa, n, dim, std_dev); 01204 break; 01205 case LAPLACE: // Laplacian, mean 0 and var 1 01206 annLaplacePts(pa, n, dim); 01207 break; 01208 case CO_GAUSS: // correlated Gaussian 01209 annCoGaussPts(pa, n, dim, corr_coef); 01210 break; 01211 case CO_LAPLACE: // correlated Laplacian 01212 annCoLaplacePts(pa, n, dim, corr_coef); 01213 break; 01214 case CLUS_GAUSS: // clustered Gaussian 01215 annClusGaussPts(pa, n, dim, n_color, new_clust, std_dev); 01216 break; 01217 case CLUS_ORTH_FLATS: // clustered on orthog flats 01218 annClusOrthFlats(pa, n, dim, n_color, new_clust, std_dev, max_dim); 01219 break; 01220 case CLUS_ELLIPSOIDS: // clustered ellipsoids 01221 annClusEllipsoids(pa, n, dim, n_color, new_clust, std_dev, 01222 std_dev_lo, std_dev_hi, max_dim); 01223 break; 01224 case PLANTED: // planted distribution 01225 annPlanted(pa, n, dim, src, n_src, std_dev); 01226 break; 01227 default: 01228 Error("INTERNAL ERROR: Unknown distribution", ANNabort); 01229 break; 01230 } 01231 01232 if (stats > SILENT) { 01233 if(type == DATA) cout << "[Generating Data Points:\n"; 01234 else cout << "[Generating Query Points:\n"; 01235 cout << " number = " << n << "\n"; 01236 cout << " dim = " << dim << "\n"; 01237 cout << " distribution = " << distr_table[distr] << "\n"; 01238 if (annIdum < 0) 01239 cout << " seed = " << annIdum << "\n"; 01240 if (distr == GAUSS || distr == CLUS_GAUSS 01241 || distr == CLUS_ORTH_FLATS) 01242 cout << " std_dev = " << std_dev << "\n"; 01243 if (distr == CLUS_ELLIPSOIDS) { 01244 cout << " std_dev = " << std_dev << " (small) \n"; 01245 cout << " std_dev_lo = " << std_dev_lo << "\n"; 01246 cout << " std_dev_hi = " << std_dev_hi << "\n"; 01247 } 01248 if (distr == CO_GAUSS || distr == CO_LAPLACE) 01249 cout << " corr_coef = " << corr_coef << "\n"; 01250 if (distr == CLUS_GAUSS || distr == CLUS_ORTH_FLATS 01251 || distr == CLUS_ELLIPSOIDS) { 01252 cout << " colors = " << n_color << "\n"; 01253 if (new_clust) 01254 cout << " (cluster centers regenerated)\n"; 01255 } 01256 if (distr == CLUS_ORTH_FLATS || distr == CLUS_ELLIPSOIDS) { 01257 cout << " max_dim = " << max_dim << "\n"; 01258 } 01259 } 01260 // want to see points? 01261 if ((type == DATA && stats >= SHOW_PTS) || 01262 (type == QUERY && stats >= QUERY_RES)) { 01263 if(type == DATA) cout << "(Data Points:\n"; 01264 else cout << "(Query Points:\n"; 01265 for (int i = 0; i < n; i++) { 01266 cout << " " << setw(4) << i << "\t"; 01267 printPoint(pa[i], dim); 01268 cout << "\n"; 01269 } 01270 cout << " )\n"; 01271 } 01272 cout << "]\n"; 01273 } 01274 01275 //------------------------------------------------------------------------ 01276 // readPts - read a collection of data or query points. 01277 //------------------------------------------------------------------------ 01278 01279 void readPts( 01280 ANNpointArray &pa, // point array (returned) 01281 int &n, // number of points 01282 char *file_nm, // file name 01283 PtType type) // point type (DATA, QUERY) 01284 { 01285 int i; 01286 //-------------------------------------------------------------------- 01287 // Open input file and read points 01288 //-------------------------------------------------------------------- 01289 ifstream in_file(file_nm); // try to open data file 01290 if (!in_file) { 01291 cerr << "File name: " << file_nm << "\n"; 01292 Error("Cannot open input data/query file", ANNabort); 01293 } 01294 // allocate storage for points 01295 if (pa != NULL) annDeallocPts(pa); // get rid of old points 01296 pa = annAllocPts(n, dim); 01297 01298 for (i = 0; i < n; i++) { // read the data 01299 if (!(in_file >> pa[i][0])) break; 01300 for (int d = 1; d < dim; d++) { 01301 in_file >> pa[i][d]; 01302 } 01303 } 01304 01305 char ignore_me; // character for EOF test 01306 in_file >> ignore_me; // try to get one more character 01307 if (!in_file.eof()) { // exhausted space before eof 01308 if (type == DATA) 01309 Error("`data_size' too small. Input file truncated.", ANNwarn); 01310 else 01311 Error("`query_size' too small. Input file truncated.", ANNwarn); 01312 } 01313 n = i; // number of points read 01314 01315 //-------------------------------------------------------------------- 01316 // Print summary 01317 //-------------------------------------------------------------------- 01318 if (stats > SILENT) { 01319 if (type == DATA) { 01320 cout << "[Read Data Points:\n"; 01321 cout << " data_size = " << n << "\n"; 01322 } 01323 else { 01324 cout << "[Read Query Points:\n"; 01325 cout << " query_size = " << n << "\n"; 01326 } 01327 cout << " file_name = " << file_nm << "\n"; 01328 cout << " dim = " << dim << "\n"; 01329 // print if results requested 01330 if ((type == DATA && stats >= SHOW_PTS) || 01331 (type == QUERY && stats >= QUERY_RES)) { 01332 cout << " (Points:\n"; 01333 for (i = 0; i < n; i++) { 01334 cout << " " << i << "\t"; 01335 printPoint(pa[i], dim); 01336 cout << "\n"; 01337 } 01338 cout << " )\n"; 01339 } 01340 cout << "]\n"; 01341 } 01342 } 01343 01344 //------------------------------------------------------------------------ 01345 // getTrueNN 01346 // Computes the true nearest neighbors. For purposes of validation, 01347 // this intentionally done in a rather dumb (but safe way), by 01348 // invoking the brute-force search. 01349 // 01350 // The number of true nearest neighbors is somewhat larger than 01351 // the number of nearest neighbors. This is so that the validation 01352 // can determine the expected difference in element ranks. 01353 // 01354 // This procedure is invoked just prior to running queries. Since 01355 // the operation takes a long time, it is performed only if needed. 01356 // In particular, once generated, it will be regenerated only if 01357 // new query or data points are generated, or if the requested number 01358 // of true near neighbors or approximate near neighbors has changed. 01359 // 01360 // To validate fixed-radius searching, we compute two counts, one 01361 // with the original query radius (trueSqRadius) and the other with 01362 // a radius shrunken by the error factor (minSqradius). We then 01363 // check that the count of points inside the approximate range is 01364 // between these two bounds. Because fixed-radius search is 01365 // allowed to ignore points within the shrunken radius, we only 01366 // compute exact neighbors within this smaller distance (for we 01367 // cannot guarantee that we will even visit the other points). 01368 //------------------------------------------------------------------------ 01369 01370 void getTrueNN() // compute true nearest neighbors 01371 { 01372 if (stats > SILENT) { 01373 cout << "(Computing true nearest neighbors for validation. This may take time.)\n"; 01374 } 01375 // deallocate existing storage 01376 if (true_nn_idx != NULL) delete [] true_nn_idx; 01377 if (true_dists != NULL) delete [] true_dists; 01378 if (min_pts_in_range != NULL) delete [] min_pts_in_range; 01379 if (max_pts_in_range != NULL) delete [] max_pts_in_range; 01380 01381 if (true_nn > data_size) { // can't get more nn than points 01382 true_nn = data_size; 01383 } 01384 01385 // allocate true answer storage 01386 true_nn_idx = new ANNidx[true_nn*query_size]; 01387 true_dists = new ANNdist[true_nn*query_size]; 01388 min_pts_in_range = new int[query_size]; 01389 max_pts_in_range = new int[query_size]; 01390 01391 ANNidxArray curr_nn_idx = true_nn_idx; // current locations in arrays 01392 ANNdistArray curr_dists = true_dists; 01393 01394 // allocate search structure 01395 ANNbruteForce *the_brute = new ANNbruteForce(data_pts, data_size, dim); 01396 // compute nearest neighbors 01397 for (int i = 0; i < query_size; i++) { 01398 if (radius_bound == 0) { // standard kNN search 01399 the_brute->annkSearch( // compute true near neighbors 01400 query_pts[i], // query point 01401 true_nn, // number of nearest neighbors 01402 curr_nn_idx, // where to put indices 01403 curr_dists); // where to put distances 01404 } 01405 else { // fixed radius kNN search 01406 // search radii limits 01407 ANNdist trueSqRadius = ANN_POW(radius_bound); 01408 ANNdist minSqRadius = ANN_POW(radius_bound / (1+epsilon)); 01409 min_pts_in_range[i] = the_brute->annkFRSearch( 01410 query_pts[i], // query point 01411 minSqRadius, // shrunken search radius 01412 true_nn, // number of near neighbors 01413 curr_nn_idx, // nearest neighbors (returned) 01414 curr_dists); // distance (returned) 01415 max_pts_in_range[i] = the_brute->annkFRSearch( 01416 query_pts[i], // query point 01417 trueSqRadius, // true search radius 01418 0, NULL, NULL); // (ignore kNN info) 01419 } 01420 curr_nn_idx += true_nn; // increment nn index pointer 01421 curr_dists += true_nn; // increment nn dist pointer 01422 } 01423 delete the_brute; // delete brute-force struct 01424 valid_dirty = ANNfalse; // validation good for now 01425 } 01426 01427 //------------------------------------------------------------------------ 01428 // doValidation 01429 // Compares the approximate answers to the k-nearest neighbors 01430 // against the true nearest neighbors (computed earlier). It is 01431 // assumed that the true nearest neighbors and indices have been 01432 // computed earlier. 01433 // 01434 // First, we check that all the results are within their allowed 01435 // limits, and generate an internal error, if not. For the sake of 01436 // performance evaluation, we also compute the following two 01437 // quantities for nearest neighbors: 01438 // 01439 // Average Error 01440 // ------------- 01441 // The relative error between the distance to a reported nearest 01442 // neighbor and the true nearest neighbor (of the same rank), 01443 // 01444 // Rank Error 01445 // ---------- 01446 // The difference in rank between the reported nearest neighbor and 01447 // its position (if any) among the true nearest neighbors. If we 01448 // cannot find this point among the true nearest neighbors, then 01449 // it assumed that the rank of the true nearest neighbor is true_nn+1. 01450 // 01451 // Because of the possibility of duplicate distances, this is computed 01452 // as follows. For the j-th reported nearest neighbor, we count the 01453 // number of true nearest neighbors that are at least this close. Let 01454 // this be rnk. Then the rank error is max(0, j-rnk). (In the code 01455 // below, j is an array index and so the first item is 0, not 1. Thus 01456 // we take max(0, j+1-rnk) instead.) 01457 // 01458 // For the results of fixed-radious range count, we verify that the 01459 // reported number of points in the range lies between the actual 01460 // number of points in the shrunken and the true search radius. 01461 //------------------------------------------------------------------------ 01462 01463 void doValidation() // perform validation 01464 { 01465 int* curr_apx_idx = apx_nn_idx; // approx index pointer 01466 ANNdistArray curr_apx_dst = apx_dists; // approx distance pointer 01467 int* curr_tru_idx = true_nn_idx; // true index pointer 01468 ANNdistArray curr_tru_dst = true_dists; // true distance pointer 01469 int i, j; 01470 01471 if (true_nn < near_neigh) { 01472 Error("Cannot validate with fewer true near neighbors than actual", ANNabort); 01473 } 01474 01475 for (i = 0; i < query_size; i++) { // validate each query 01476 //---------------------------------------------------------------- 01477 // Compute result errors 01478 // In fixed radius search it is possible that not all k 01479 // nearest neighbors were computed. Because the true 01480 // results are computed over the shrunken radius, we should 01481 // have at least as many true nearest neighbors as 01482 // approximate nearest neighbors. (If not, an infinite 01483 // error will be generated, and so an internal error will 01484 // will be generated. 01485 // 01486 // Because nearest neighbors are sorted in increasing order 01487 // of distance, as soon as we see a null index, we can 01488 // terminate the distance checking. The error in the 01489 // result should not exceed epsilon. However, if 01490 // max_pts_visit is nonzero (meaning that the search is 01491 // terminated early) this might happen. 01492 //---------------------------------------------------------------- 01493 for (j = 0; j < near_neigh; j++) { 01494 if (curr_tru_idx[j] == ANN_NULL_IDX)// no more true neighbors? 01495 break; 01496 // true i-th smallest distance 01497 double true_dist = ANN_ROOT(curr_tru_dst[j]); 01498 // reported i-th smallest 01499 double rept_dist = ANN_ROOT(curr_apx_dst[j]); 01500 // better than optimum? 01501 if (rept_dist < true_dist*(1-ERR)) { 01502 Error("INTERNAL ERROR: True nearest neighbor incorrect", 01503 ANNabort); 01504 } 01505 01506 double resultErr; // result error 01507 if (true_dist == 0.0) { // let's not divide by zero 01508 if (rept_dist != 0.0) resultErr = ANN_DBL_MAX; 01509 else resultErr = 0.0; 01510 } 01511 else { 01512 resultErr = (rept_dist - true_dist) / ((double) true_dist); 01513 } 01514 01515 if (resultErr > epsilon && max_pts_visit == 0) { 01516 Error("INTERNAL ERROR: Actual error exceeds epsilon", 01517 ANNabort); 01518 } 01519 #ifdef ANN_PERF 01520 ann_average_err += resultErr; // update statistics error 01521 #endif 01522 } 01523 //-------------------------------------------------------------------- 01524 // Compute rank errors (only needed for perf measurements) 01525 //-------------------------------------------------------------------- 01526 #ifdef ANN_PERF 01527 for (j = 0; j < near_neigh; j++) { 01528 if (curr_tru_idx[i] == ANN_NULL_IDX) // no more true neighbors? 01529 break; 01530 01531 double rnkErr = 0.0; // rank error 01532 // reported j-th distance 01533 ANNdist rept_dist = curr_apx_dst[j]; 01534 int rnk = 0; // compute rank of this item 01535 while (rnk < true_nn && curr_tru_dst[rnk] <= rept_dist) 01536 rnk++; 01537 if (j+1-rnk > 0) rnkErr = (double) (j+1-rnk); 01538 ann_rank_err += rnkErr; // update average rank error 01539 } 01540 #endif 01541 //---------------------------------------------------------------- 01542 // Check range counts from fixed-radius query 01543 //---------------------------------------------------------------- 01544 if (radius_bound != 0) { // fixed-radius search 01545 if (apx_pts_in_range[i] < min_pts_in_range[i] || 01546 apx_pts_in_range[i] > max_pts_in_range[i]) 01547 Error("INTERNAL ERROR: Invalid fixed-radius range count", 01548 ANNabort); 01549 } 01550 01551 curr_apx_idx += near_neigh; 01552 curr_apx_dst += near_neigh; 01553 curr_tru_idx += true_nn; // increment current pointers 01554 curr_tru_dst += true_nn; 01555 } 01556 } 01557 01558 //---------------------------------------------------------------------- 01559 // treeStats 01560 // Computes a number of statistics related to kd_trees and 01561 // bd_trees. These statistics are printed if in verbose mode, 01562 // and otherwise they are only printed if they are deemed to 01563 // be outside of reasonable operating bounds. 01564 //---------------------------------------------------------------------- 01565 01566 #define log2(x) (log(x)/log(2.0)) // log base 2 01567 01568 void treeStats( 01569 ostream &out, // output stream 01570 ANNbool verbose) // print stats 01571 { 01572 const int MIN_PTS = 20; // min no. pts for checking 01573 const float MAX_FRAC_TL = 0.50; // max frac of triv leaves 01574 const float MAX_AVG_AR = 20; // max average aspect ratio 01575 01576 ANNkdStats st; // statistics structure 01577 01578 the_tree->getStats(st); // get statistics 01579 // total number of nodes 01580 int n_nodes = st.n_lf + st.n_spl + st.n_shr; 01581 // should be O(n/bs) 01582 int opt_n_nodes = (int) (2*(float(st.n_pts)/st.bkt_size)); 01583 int too_many_nodes = 10*opt_n_nodes; 01584 if (st.n_pts >= MIN_PTS && n_nodes > too_many_nodes) { 01585 out << "-----------------------------------------------------------\n"; 01586 out << "Warning: The tree has more than 10x as many nodes as points.\n"; 01587 out << "You may want to consider a different split or shrink method.\n"; 01588 out << "-----------------------------------------------------------\n"; 01589 verbose = ANNtrue; 01590 } 01591 // fraction of trivial leaves 01592 float frac_tl = (st.n_lf == 0 ? 0 : ((float) st.n_tl)/ st.n_lf); 01593 if (st.n_pts >= MIN_PTS && frac_tl > MAX_FRAC_TL) { 01594 out << "-----------------------------------------------------------\n"; 01595 out << "Warning: A significant fraction of leaves contain no points.\n"; 01596 out << "You may want to consider a different split or shrink method.\n"; 01597 out << "-----------------------------------------------------------\n"; 01598 verbose = ANNtrue; 01599 } 01600 // depth should be O(dim*log n) 01601 int too_many_levels = (int) (2.0 * st.dim * log2((double) st.n_pts)); 01602 int opt_levels = (int) log2(double(st.n_pts)/st.bkt_size); 01603 if (st.n_pts >= MIN_PTS && st.depth > too_many_levels) { 01604 out << "-----------------------------------------------------------\n"; 01605 out << "Warning: The tree is more than 2x as deep as (dim*log n).\n"; 01606 out << "You may want to consider a different split or shrink method.\n"; 01607 out << "-----------------------------------------------------------\n"; 01608 verbose = ANNtrue; 01609 } 01610 // average leaf aspect ratio 01611 if (st.n_pts >= MIN_PTS && st.avg_ar > MAX_AVG_AR) { 01612 out << "-----------------------------------------------------------\n"; 01613 out << "Warning: Average aspect ratio of cells is quite large.\n"; 01614 out << "This may slow queries depending on the point distribution.\n"; 01615 out << "-----------------------------------------------------------\n"; 01616 verbose = ANNtrue; 01617 } 01618 01619 //------------------------------------------------------------------ 01620 // Print summaries if requested 01621 //------------------------------------------------------------------ 01622 if (verbose) { // output statistics 01623 out << " (Structure Statistics:\n"; 01624 out << " n_nodes = " << n_nodes 01625 << " (opt = " << opt_n_nodes 01626 << ", best if < " << too_many_nodes << ")\n" 01627 << " n_leaves = " << st.n_lf 01628 << " (" << st.n_tl << " contain no points)\n" 01629 << " n_splits = " << st.n_spl << "\n" 01630 << " n_shrinks = " << st.n_shr << "\n"; 01631 out << " empty_leaves = " << frac_tl*100 01632 << " percent (best if < " << MAX_FRAC_TL*100 << " percent)\n"; 01633 out << " depth = " << st.depth 01634 << " (opt = " << opt_levels 01635 << ", best if < " << too_many_levels << ")\n"; 01636 out << " avg_aspect_ratio = " << st.avg_ar 01637 << " (best if < " << MAX_AVG_AR << ")\n"; 01638 out << " )\n"; 01639 } 01640 }