ann_test.cpp

Go to the documentation of this file.
00001 //----------------------------------------------------------------------
00002 //      File:                   ann_test.cpp
00003 //      Programmer:             Sunil Arya and David Mount
00004 //      Description:    test program for ANN (approximate nearest neighbors)
00005 //  Last modified:      08/04/06 (Version 1.1.1)
00006 //----------------------------------------------------------------------
00007 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
00008 // David Mount.  All Rights Reserved.
00009 // 
00010 // This software and related documentation is part of the Approximate
00011 // Nearest Neighbor Library (ANN).  This software is provided under
00012 // the provisions of the Lesser GNU Public License (LGPL).  See the
00013 // file ../ReadMe.txt for further information.
00014 // 
00015 // The University of Maryland (U.M.) and the authors make no
00016 // representations about the suitability or fitness of this software for
00017 // any purpose.  It is provided "as is" without express or implied
00018 // warranty.
00019 //----------------------------------------------------------------------
00020 // History:
00021 //      Revision 0.1  03/04/98
00022 //              Initial release
00023 //      Revision 0.2  06/26/98
00024 //              Added CLOCKS_PER_SEC definition if needed
00025 //      Revision 1.0  04/01/05
00026 //              Added comments (from "#" to eol)
00027 //              Added clus_orth_flats and clus_ellipsoids distributions
00028 //              Fixed order of fair and midpt in split_table
00029 //              Added dump/load operations
00030 //              Cleaned up C++ for modern compilers
00031 //      Revision 1.1  05/03/05
00032 //              Added fixed radius kNN search
00033 //      Revision 1.1.1 08/04/06
00034 //              Added planted distribution
00035 //----------------------------------------------------------------------
00036 
00037 #include <ctime>                                                // clock
00038 #include <cmath>                                                // math routines
00039 #include <string>                                               // C string ops
00040 #include <fstream>                                              // file I/O
00041 
00042 #include <ANN/ANN.h>                                    // ANN declarations
00043 #include <ANN/ANNx.h>                                   // more ANN declarations
00044 #include <ANN/ANNperf.h>                                // performance evaluation
00045 
00046 #include "rand.h"                                               // random point generation
00047 
00048 #ifndef CLOCKS_PER_SEC                                  // define clocks-per-second if needed
00049   #define CLOCKS_PER_SEC 1000000
00050 #endif
00051 
00052 using namespace std;                                    // make std:: available
00053 
00054 //----------------------------------------------------------------------
00055 // ann_test
00056 //
00057 // This program is a driver for testing and evaluating the ANN library
00058 // for computing approximate nearest neighbors.  It allows the user to
00059 // generate data and query sets of various sizes, dimensions, and
00060 // distributions, to build kd- and bbd-trees of various types, and then
00061 // run queries and outputting various performance statistics.
00062 //
00063 // Overview:
00064 // ---------
00065 // The test program is run as follows:
00066 // 
00067 //              ann_test < test_input > test_output
00068 //
00069 // where the test_input file contains a list of directives as described
00070 // below.  Directives consist of a directive name, followed by list of
00071 // arguments (depending on the directive).  Arguments and directives are
00072 // separated by white space (blank, tab, and newline).  String arguments
00073 // are not quoted, and consist of a string of nonwhite chacters.  A
00074 // character "#" denotes a comment.  The following characters up to
00075 // the end of line are ignored.  Comments may only be inserted between
00076 // directives (not within the argument list of a directive).
00077 //
00078 // Basic operations:
00079 // -----------------
00080 // The test program can perform the following operations.  How these
00081 // operations are performed depends on the options which are described
00082 // later.
00083 //
00084 //              Data Generation:
00085 //              ----------------
00086 //              read_data_pts <file>    Create a set of data points whose
00087 //                                                              coordinates are input from file <file>.
00088 //              gen_data_pts                    Create a set of data points whose
00089 //                                                              coordinates are generated from the
00090 //                                                              current point distribution.
00091 //
00092 //              Building the tree:
00093 //              ------------------
00094 //              build_ann                               Generate an approximate nearest neighbor
00095 //                                                              structure for the current data set, using
00096 //                                                              the selected splitting rules.  Any existing
00097 //                                                              tree will be destroyed.
00098 //
00099 //              Query Generation/Searching:
00100 //              ---------------------------
00101 //              read_query_pts <file>   Create a set of query points whose
00102 //                                                              coordinates are input from file <file>.
00103 //              gen_query_pts                   Create a set of query points whose
00104 //                                                              coordinates are generated from the
00105 //                                                              current point distribution.
00106 //              run_queries <string>    Apply nearest neighbor searching to the
00107 //                                                              query points using the approximate nearest
00108 //                                                              neighbor structure and the given search
00109 //                                                              strategy.  Possible strategies are:
00110 //                                                                      standard = standard kd-tree search
00111 //                                                                      priority = priority search
00112 //
00113 //              Miscellaneous:
00114 //              --------------
00115 //              output_label                    Output a label to the output file.
00116 //              dump <file>                             Dump the current structure to given file.
00117 //                                                              (The dump format is explained further in
00118 //                                                              the source file kd_tree.cc.)
00119 //              load <file>                             Load a tree from a data file which was
00120 //                                                              created by the dump operation.  Any
00121 //                                                              existing tree will be destroyed.
00122 //
00123 // Options:
00124 // --------
00125 // How these operations are performed depends on a set of options.
00126 // If an option is not specified, a default value is used. An option
00127 // retains its value until it is set again.  String inputs are not
00128 // enclosed in quotes, and must contain no embedded white space (sorry,
00129 // this is C++'s convention).
00130 //
00131 // Options affecting search tree structure:
00132 // ----------------------------------------
00133 //              split_rule <type>               Type of splitting rule to use in building
00134 //                                                              the search tree.  Choices are:
00135 //                                                                      kd                      = optimized kd-tree
00136 //                                                                      midpt           = midpoint split
00137 //                                                                      fair            = fair split
00138 //                                                                      sl_midpt        = sliding midpt split
00139 //                                                                      sl_fair         = sliding fair split
00140 //                                                                      suggest         = authors' choice for best
00141 //                                                              The default is "suggest".  See the file
00142 //                                                              kd_split.cc for more detailed information.
00143 //
00144 //              shrink_rule <type>              Type of shrinking rule to use in building
00145 //                                                              a bd-tree data structure.  If "none" is
00146 //                                                              given, then no shrinking is performed and
00147 //                                                              the result is a kd-tree.  Choices are:
00148 //                                                                      none            = perform no shrinking
00149 //                                                                      simple          = simple shrinking
00150 //                                                                      centroid        = centroid shrinking
00151 //                                                                      suggest         = authors' choice for best
00152 //                                                              The default is "none".  See the file
00153 //                                                              bd_tree.cc for more information.
00154 //              bucket_size <int>               Bucket size, that is, the maximum number of
00155 //                                                              points stored in each leaf node.
00156 //
00157 // Options affecting data and query point generation:
00158 // --------------------------------------------------
00159 //              dim <int>                               Dimension of space.
00160 //              seed <int>                              Seed for random number generation.
00161 //              data_size <int>                 Number of data points.  When reading data
00162 //                                                              points from a file, this indicates the
00163 //                                                              maximum number of points for storage
00164 //                                                              allocation. Default = 100.
00165 //              query_size <int>                Same as data_size for query points.
00166 //              std_dev <float>                 Standard deviation (used in gauss,
00167 //                                                              planted, and clustered distributions).
00168 //                                                              This is the "small" distribution for
00169 //                                                              clus_ellipsoids.  Default = 1.
00170 //              std_dev_lo <float>              Low and high standard deviations (used in
00171 //              std_dev_hi <float>              clus_ellipsoids).  Default = 1.
00172 //              corr_coef <float>               Correlation coefficient (used in co-gauss
00173 //                                                              and co_lapace distributions). Default = 0.05.
00174 //              colors <int>                    Number of color classes (clusters) (used
00175 //                                                              in the clustered distributions).  Default = 5.
00176 //              new_clust                               Once generated, cluster centers are not
00177 //                                                              normally regenerated.  This is so that both
00178 //                                                              query points and data points can be generated
00179 //                                                              using the same set of clusters.  This option
00180 //                                                              forces new cluster centers to be generated
00181 //                                                              with the next generation of either data or
00182 //                                                              query points.
00183 //              max_clus_dim <int>              Maximum dimension of clusters (used in
00184 //                                                              clus_orth_flats and clus_ellipsoids).
00185 //                                                              Default = 1.
00186 //              distribution <string>   Type of input distribution
00187 //                                                                      uniform         = uniform over cube [-1,1]^d.
00188 //                                                                      gauss           = Gaussian with mean 0
00189 //                                                                      laplace         = Laplacian, mean 0 and var 1
00190 //                                                                      co_gauss        = correlated Gaussian
00191 //                                                                      co_laplace      = correlated Laplacian
00192 //                                                                      clus_gauss      = clustered Gaussian
00193 //                                                                      clus_orth_flats = clusters of orth flats
00194 //                                                                      clus_ellipsoids = clusters of ellipsoids
00195 //                                                                      planted         = planted distribution
00196 //                                                              See the file rand.cpp for further information.
00197 //
00198 // Options affecting nearest neighbor search:
00199 // ------------------------------------------
00200 //              epsilon <float>                 Error bound for approx. near neigh. search.
00201 //              near_neigh <int>                Number of nearest neighbors to compute.
00202 //              max_pts_visit <int>             Maximum number of points to visit before
00203 //                                                              terminating.  (Used in applications where
00204 //                                                              real-time performance is important.)
00205 //                                                              (Default = 0, which means no limit.)
00206 //              radius_bound <float>    Sets an upper bound on the nearest
00207 //                                                              neighbor search radius.  If the bound is
00208 //                                                              positive, then fixed-radius nearest
00209 //                                                              neighbor searching is performed, and the
00210 //                                                              count of the number of points in the
00211 //                                                              range is returned.  If the bound is
00212 //                                                              zero, then standard search is used.
00213 //                                                              This can only be used with standard, not
00214 //                                                              priority, search.  (Default = 0, which
00215 //                                                              means standard search.)
00216 //
00217 // Options affection general program behavior:
00218 // -------------------------------------------
00219 //              stats <string>                  Level of statistics output
00220 //                                                                      silent           = no output,
00221 //                                                                      exec_time       += execution time only
00222 //                                                                      prep_stats      += preprocessing statistics
00223 //                                                                      query_stats += query performance stats
00224 //                                                                      query_res       += results of queries
00225 //                                                                      show_pts        += show the data points
00226 //                                                                      show_struct += print search structure
00227 //              validate <string>               Validate experiment and compute average
00228 //                                                              error.  Since validation causes exact
00229 //                                                              nearest neighbors to be computed by the
00230 //                                                              brute force method, this can take a long
00231 //                                                              time.  Valid arguments are:
00232 //                                                                      on                      = turn validation on
00233 //                                                                      off                     = turn validation off
00234 //              true_near_neigh <int>   Number of true nearest neighbors to compute.
00235 //                                                              When validating, we compute the difference
00236 //                                                              in rank between each reported nearest neighbor
00237 //                                                              and the true nearest neighbor of the same
00238 //                                                              rank.  Thus it is necessary to compute a
00239 //                                                              few more true nearest neighbors.  By default
00240 //                                                              we compute 10 more than near_neigh.  With
00241 //                                                              this option the exact number can be set.
00242 //                                                              (Used only when validating.)
00243 //
00244 // Example:
00245 // --------
00246 //      output_label test_run_0         # output label for this run
00247 //        validate off                          # do not perform validation
00248 //        dim 16                                        # points in dimension 16
00249 //        stats query_stats                     # output performance statistics for queries
00250 //        seed 121212                           # random number seed
00251 //        data_size 1000
00252 //        distribution uniform
00253 //      gen_data_pts                            # 1000 uniform data points in dim 16
00254 //        query_size 100
00255 //        std_dev 0.05
00256 //        distribution clus_gauss
00257 //      gen_query_pts                           # 100 points in 10 clusters with std_dev 0.05
00258 //        bucket_size 2
00259 //        split_rule kd
00260 //        shrink_rule none
00261 //      build_ann                                       # kd-tree, bucket size 2
00262 //        epsilon 0.1
00263 //        near_neigh 5
00264 //        max_pts_visit 100                     # stop search if more than 100 points seen
00265 //      run_queries standard            # run queries; 5 nearest neighbors, 10% error
00266 //        data_size 500
00267 //      read_data_pts data.in           # read up to 500 points from file data.in
00268 //        split_rule sl_midpt
00269 //        shrink_rule simple
00270 //      build_ann                                       # bd-tree; simple shrink, sliding midpoint split
00271 //        epsilon 0
00272 //      run_queries priority            # run same queries; 0 allowable error
00273 //
00274 //------------------------------------------------------------------------
00275 
00276 //------------------------------------------------------------------------
00277 //      Constants
00278 //------------------------------------------------------------------------
00279 
00280 const int               STRING_LEN              = 500;                  // max string length
00281 const double    ERR                             = 0.00001;              // epsilon (for float compares)
00282 
00283 //------------------------------------------------------------------------
00284 //      Enumerated values and conversions
00285 //------------------------------------------------------------------------
00286 
00287 typedef enum {DATA, QUERY} PtType;              // point types
00288 
00289 //------------------------------------------------------------------------
00290 //      Statistics output levels
00291 //------------------------------------------------------------------------
00292 
00293 typedef enum {                                  // stat levels
00294                 SILENT,                                                 // no output
00295                 EXEC_TIME,                                              // just execution time
00296                 PREP_STATS,                                             // preprocessing info
00297                 QUERY_STATS,                                    // query performance
00298                 QUERY_RES,                                              // query results
00299                 SHOW_PTS,                                               // show data points
00300                 SHOW_STRUCT,                                    // show tree structure
00301                 N_STAT_LEVELS}                                  // number of levels
00302                 StatLev;
00303 
00304 const char stat_table[N_STAT_LEVELS][STRING_LEN] = {
00305                 "silent",                                               // SILENT
00306                 "exec_time",                                    // EXEC_TIME
00307                 "prep_stats",                                   // PREP_STATS
00308                 "query_stats",                                  // QUERY_STATS
00309                 "query_res",                                    // QUERY_RES
00310                 "show_pts",                                             // SHOW_PTS
00311                 "show_struct"};                                 // SHOW_STRUCT
00312 
00313 //------------------------------------------------------------------------
00314 //      Distributions
00315 //------------------------------------------------------------------------
00316 
00317 typedef enum {                                  // distributions
00318                 UNIFORM,                                                // uniform over cube [-1,1]^d.
00319                 GAUSS,                                                  // Gaussian with mean 0
00320                 LAPLACE,                                                // Laplacian, mean 0 and var 1
00321                 CO_GAUSS,                                               // correlated Gaussian
00322                 CO_LAPLACE,                                             // correlated Laplacian
00323                 CLUS_GAUSS,                                             // clustered Gaussian
00324                 CLUS_ORTH_FLATS,                                // clustered on orthog flats
00325                 CLUS_ELLIPSOIDS,                                // clustered on ellipsoids
00326                 PLANTED,                                                // planted distribution
00327                 N_DISTRIBS}
00328                 Distrib;
00329 
00330 const char distr_table[N_DISTRIBS][STRING_LEN] = {
00331                 "uniform",                                              // UNIFORM
00332                 "gauss",                                                // GAUSS
00333                 "laplace",                                              // LAPLACE
00334                 "co_gauss",                                             // CO_GAUSS
00335                 "co_laplace",                                   // CO_LAPLACE
00336                 "clus_gauss",                                   // CLUS_GAUSS
00337                 "clus_orth_flats",                              // CLUS_ORTH_FLATS
00338                 "clus_ellipsoids",                              // CLUS_ELLIPSOIS
00339                 "planted"};                                             // PLANTED
00340 
00341 //------------------------------------------------------------------------
00342 //      Splitting rules for kd-trees (see ANN.h for types)
00343 //------------------------------------------------------------------------
00344 
00345 const int N_SPLIT_RULES = 6;
00346 const char split_table[N_SPLIT_RULES][STRING_LEN] = {
00347                 "standard",                                             // standard optimized kd-tree
00348                 "midpt",                                                // midpoint split
00349                 "fair",                                                 // fair split
00350                 "sl_midpt",                                             // sliding midpt split
00351                 "sl_fair",                                              // sliding fair split
00352                 "suggest"};                                             // authors' choice for best
00353 
00354 //------------------------------------------------------------------------
00355 //      Shrinking rules for bd-trees (see ANN.h for types)
00356 //------------------------------------------------------------------------
00357 
00358 const int N_SHRINK_RULES = 4;
00359 const char shrink_table[N_SHRINK_RULES][STRING_LEN] = {
00360                 "none",                                                 // perform no shrinking (kd-tree)
00361                 "simple",                                               // simple shrinking
00362                 "centroid",                                             // centroid shrinking
00363                 "suggest"};                                             // authors' choice for best
00364 
00365 //----------------------------------------------------------------------
00366 //      Short utility functions
00367 //              Error - general error routine
00368 //              printPoint - print a point to standard output
00369 //              lookUp - look up a name in table and return index
00370 //----------------------------------------------------------------------
00371 
00372 void Error(                                                             // error routine
00373         char                            *msg,                   // error message
00374         ANNerr                          level)                  // abort afterwards
00375 {
00376         if (level == ANNabort) {
00377                 cerr << "ann_test: ERROR------->" << msg << "<-------------ERROR\n";
00378                 exit(1);
00379         }
00380         else {
00381                 cerr << "ann_test: WARNING----->" << msg << "<-------------WARNING\n";
00382         }
00383 }
00384 
00385 void printPoint(                                                // print point
00386         ANNpoint                        p,                              // the point
00387         int                                     dim)                    // the dimension
00388 {
00389         cout << "[";
00390         for (int i = 0; i < dim; i++) {
00391                 cout << p[i];
00392                 if (i < dim-1) cout << ",";
00393         }
00394         cout << "]";
00395 }
00396 
00397 int lookUp(                                                             // look up name in table
00398         const char      *arg,                                   // name to look up
00399         const char      (*table)[STRING_LEN],   // name table
00400         int                     size)                                   // table size
00401 {
00402         int i;
00403         for (i = 0; i < size; i++) {
00404                 if (!strcmp(arg, table[i])) return i;
00405         }
00406         return i;
00407 }
00408 
00409 //------------------------------------------------------------------------
00410 // Function declarations
00411 //------------------------------------------------------------------------
00412 
00413 void generatePts(                                               // generate data/query points
00414         ANNpointArray           &pa,                    // point array (returned)
00415         int                                     n,                              // number of points
00416         PtType                          type,                   // point type
00417         ANNbool                         new_clust,              // new cluster centers desired?
00418         ANNpointArray           src = NULL,             // source array (for PLANTED)
00419         int                                     n_src = 0);             // source size (for PLANTED)
00420 
00421 void readPts(                                                   // read data/query points from file
00422         ANNpointArray           &pa,                    // point array (returned)
00423         int                                     &n,                             // number of points
00424         char                            *file_nm,               // file name
00425         PtType                          type);                  // point type (DATA, QUERY)
00426 
00427 void doValidation();                                    // perform validation
00428 void getTrueNN();                                               // compute true nearest neighbors
00429 
00430 void treeStats(                                                 // print statistics on kd- or bd-tree
00431         ostream                         &out,                   // output stream
00432         ANNbool                         verbose);               // print stats
00433 
00434 //------------------------------------------------------------------------
00435 //      Default execution parameters
00436 //------------------------------------------------------------------------
00437 const int               extra_nn                = 10;                   // how many extra true nn's?
00438 
00439 const int               def_dim                 = 2;                    // def dimension
00440 const int               def_data_size   = 100;                  // def data size
00441 const int               def_query_size  = 100;                  // def number of queries
00442 const int               def_n_color             = 5;                    // def number of colors
00443 const ANNbool   def_new_clust   = ANNfalse;             // def new clusters flag
00444 const int               def_max_dim             = 1;                    // def max flat dimension
00445 const Distrib   def_distr               = UNIFORM;              // def distribution
00446 const double    def_std_dev             = 1.00;                 // def standard deviation
00447 const double    def_corr_coef   = 0.05;                 // def correlation coef
00448 const int               def_bucket_size = 1;                    // def bucket size
00449 const double    def_epsilon             = 0.0;                  // def error bound
00450 const int               def_near_neigh  = 1;                    // def number of near neighbors
00451 const int               def_max_visit   = 0;                    // def number of points visited
00452 const int               def_rad_bound   = 0;                    // def radius bound
00453                                                                                                 // def number of true nn's
00454 const int               def_true_nn             = def_near_neigh + extra_nn;
00455 const int               def_seed                = 0;                    // def seed for random numbers
00456 const ANNbool   def_validate    = ANNfalse;             // def validation flag
00457                                                                                                 // def statistics output level
00458 const StatLev   def_stats               = QUERY_STATS;
00459 const ANNsplitRule                                                              // def splitting rule
00460                                 def_split               = ANN_KD_SUGGEST;
00461 const ANNshrinkRule                                                             // def shrinking rule
00462                                 def_shrink              = ANN_BD_NONE;
00463 
00464 //------------------------------------------------------------------------
00465 //      Global variables - Execution options
00466 //------------------------------------------------------------------------
00467 
00468 int                             dim;                                    // dimension
00469 int                             data_size;                              // data size
00470 int                             query_size;                             // number of queries
00471 int                             n_color;                                // number of colors
00472 ANNbool                 new_clust;                              // generate new clusters?
00473 int                             max_dim;                                // maximum flat dimension
00474 Distrib                 distr;                                  // distribution
00475 double                  corr_coef;                              // correlation coef
00476 double                  std_dev;                                // standard deviation
00477 double                  std_dev_lo;                             // low standard deviation
00478 double                  std_dev_hi;                             // high standard deviation
00479 int                             bucket_size;                    // bucket size
00480 double                  epsilon;                                // error bound
00481 int                             near_neigh;                             // number of near neighbors
00482 int                             max_pts_visit;                  // max number of points to visit
00483 double                  radius_bound;                   // maximum radius search bound
00484 int                             true_nn;                                // number of true nn's
00485 ANNbool                 validate;                               // validation flag
00486 StatLev                 stats;                                  // statistics output level
00487 ANNsplitRule    split;                                  // splitting rule
00488 ANNshrinkRule   shrink;                                 // shrinking rule
00489 
00490 //------------------------------------------------------------------------
00491 //      More globals - pointers to dynamically allocated arrays and structures
00492 //
00493 //              It is assumed that all these values are set to NULL when nothing
00494 //              is allocated.
00495 //
00496 //              data_pts, query_pts                             The data and query points
00497 //              the_tree                                                Points to the kd- or bd-tree for
00498 //                                                                              nearest neighbor searching.
00499 //              apx_nn_idx, apx_dists                   Record approximate near neighbor
00500 //                                                                              indices and distances
00501 //              apx_pts_in_range                                Counts of the number of points in
00502 //                                                                              the in approx range, for fixed-
00503 //                                                                              radius NN searching.
00504 //              true_nn_idx, true_dists                 Record true near neighbor
00505 //                                                                              indices and distances
00506 //              min_pts_in_range, max_...               Min and max counts of the number
00507 //                                                                              of points in the in approximate
00508 //                                                                              range.
00509 //              valid_dirty                                             To avoid repeated validation,
00510 //                                                                              we only validate query results
00511 //                                                                              once.  This validation becomes
00512 //                                                                              invalid, if a new tree, new data
00513 //                                                                              points or new query points have
00514 //                                                                              been generated.
00515 //              tree_data_size                                  The number of points in the
00516 //                                                                              current tree.  (This will be the
00517 //                                                                              same a data_size unless points have
00518 //                                                                              been added since the tree was
00519 //                                                                              built.)
00520 //
00521 //              The approximate and true nearest neighbor results are stored
00522 //              in: apx_nn_idx, apx_dists, and true_nn_idx, true_dists.
00523 //              They are really flattened 2-dimensional arrays. Each of these
00524 //              arrays consists of query_size blocks, each of which contains
00525 //              near_neigh (or true_nn) entries, one for each of the nearest
00526 //              neighbors for a given query point.
00527 //------------------------------------------------------------------------
00528 
00529 ANNpointArray   data_pts;                               // data points
00530 ANNpointArray   query_pts;                              // query points
00531 ANNbd_tree*             the_tree;                               // kd- or bd-tree search structure
00532 ANNidxArray             apx_nn_idx;                             // storage for near neighbor indices
00533 ANNdistArray    apx_dists;                              // storage for near neighbor distances
00534 int*                    apx_pts_in_range;               // storage for no. of points in range
00535 ANNidxArray             true_nn_idx;                    // true near neighbor indices
00536 ANNdistArray    true_dists;                             // true near neighbor distances
00537 int*                    min_pts_in_range;               // min points in approx range
00538 int*                    max_pts_in_range;               // max points in approx range
00539 
00540 ANNbool                 valid_dirty;                    // validation is no longer valid
00541 
00542 //------------------------------------------------------------------------
00543 //      Initialize global parameters
00544 //------------------------------------------------------------------------
00545 
00546 void initGlobals()
00547 {
00548         dim                                     = def_dim;                              // init execution parameters
00549         data_size                       = def_data_size;
00550         query_size                      = def_query_size;
00551         distr                           = def_distr;
00552         corr_coef                       = def_corr_coef;
00553         std_dev                         = def_std_dev;
00554         std_dev_lo                      = def_std_dev;
00555         std_dev_hi                      = def_std_dev;
00556         new_clust                       = def_new_clust;
00557         max_dim                         = def_max_dim;
00558         n_color                         = def_n_color;
00559         bucket_size                     = def_bucket_size;
00560         epsilon                         = def_epsilon;
00561         near_neigh                      = def_near_neigh;
00562         max_pts_visit           = def_max_visit;
00563         radius_bound            = def_rad_bound;
00564         true_nn                         = def_true_nn;
00565         validate                        = def_validate;
00566         stats                           = def_stats;
00567         split                           = def_split;
00568         shrink                          = def_shrink;
00569         annIdum                         = -def_seed;                    // init. global seed for ran0()
00570 
00571         data_pts                        = NULL;                                 // initialize storage pointers
00572         query_pts                       = NULL;
00573         the_tree                        = NULL;
00574         apx_nn_idx                      = NULL;
00575         apx_dists                       = NULL;
00576         apx_pts_in_range        = NULL;
00577         true_nn_idx             = NULL;
00578         true_dists                      = NULL;
00579         min_pts_in_range        = NULL;
00580         max_pts_in_range        = NULL;
00581 
00582         valid_dirty                     = ANNtrue;                              // (validation must be done)
00583 }
00584 
00585 //------------------------------------------------------------------------
00586 // getDirective - skip comments and read next directive
00587 //      Returns ANNtrue if directive read, and ANNfalse if eof seen.
00588 //------------------------------------------------------------------------
00589 
00590 ANNbool skipComment(                            // skip any comments
00591     istream             &in)                            // input stream
00592 {
00593     char ch = 0;
00594                                                 // skip whitespace
00595     do { in.get(ch); } while (isspace(ch) && !in.eof());
00596     while (ch == '#' && !in.eof()) {            // comment?
00597                                                 // skip to end of line
00598         do { in.get(ch); } while(ch != '\n' && !in.eof());
00599                                                 // skip whitespace
00600         do { in.get(ch); } while(isspace(ch) && !in.eof());
00601     }
00602     if (in.eof()) return ANNfalse;                      // end of file
00603     in.putback(ch);                             // put character back
00604     return ANNtrue;
00605 }
00606 
00607 ANNbool getDirective(
00608     istream             &in,                    // input stream
00609     char                *directive)             // directive storage
00610 {
00611     if (!skipComment(in))                       // skip comments
00612         return ANNfalse;                        // found eof along the way?
00613     in >> directive;                            // read directive
00614     return ANNtrue;
00615 }
00616 
00617 
00618 //------------------------------------------------------------------------
00619 // main program - driver
00620 //              The main program reads input options, invokes the necessary
00621 //              routines to process them.
00622 //------------------------------------------------------------------------
00623 
00624 int main(int argc, char** argv)
00625 {
00626         long            clock0;                                         // clock time
00627         char            directive[STRING_LEN];          // input directive
00628         char            arg[STRING_LEN];                        // all-purpose argument
00629 
00630         cout << "------------------------------------------------------------\n"
00631                  << "ann_test: Version " << ANNversion << " " << ANNversionCmt << "\n"
00632                  << "    Copyright: " << ANNcopyright << ".\n"
00633                  << "    Latest Revision: " << ANNlatestRev << ".\n"
00634                  << "------------------------------------------------------------\n\n";
00635 
00636         initGlobals();                                                          // initialize global values
00637 
00638         //--------------------------------------------------------------------
00639         //      Main input loop
00640         //--------------------------------------------------------------------
00641                                                                                                 // read input directive
00642         while (getDirective(cin, directive)) {
00643                 //----------------------------------------------------------------
00644                 //      Read options
00645                 //----------------------------------------------------------------
00646                 if (!strcmp(directive,"dim")) {
00647                         cin >> dim;
00648                 }
00649                 else if (!strcmp(directive,"colors")) {
00650                         cin >> n_color;
00651                 }
00652                 else if (!strcmp(directive,"new_clust")) {
00653                         new_clust = ANNtrue;
00654                 }
00655                 else if (!strcmp(directive,"max_clus_dim")) {
00656                         cin >> max_dim;
00657                 }
00658                 else if (!strcmp(directive,"std_dev")) {
00659                         cin >> std_dev;
00660                 }
00661                 else if (!strcmp(directive,"std_dev_lo")) {
00662                         cin >> std_dev_lo;
00663                 }
00664                 else if (!strcmp(directive,"std_dev_hi")) {
00665                         cin >> std_dev_hi;
00666                 }
00667                 else if (!strcmp(directive,"corr_coef")) {
00668                         cin >> corr_coef;
00669                 }
00670                 else if (!strcmp(directive, "data_size")) {
00671                         cin >> data_size;
00672                 }
00673                 else if (!strcmp(directive,"query_size")) {
00674                         cin >> query_size;
00675                 }
00676                 else if (!strcmp(directive,"bucket_size")) {
00677                         cin >> bucket_size;
00678                 }
00679                 else if (!strcmp(directive,"epsilon")) {
00680                         cin >> epsilon;
00681                 }
00682                 else if (!strcmp(directive,"max_pts_visit")) {
00683                         cin >> max_pts_visit;
00684                         valid_dirty = ANNtrue;                          // validation must be redone
00685                 }
00686                 else if (!strcmp(directive,"radius_bound")) {
00687                         cin >> radius_bound;
00688                         valid_dirty = ANNtrue;                          // validation must be redone
00689                 }
00690                 else if (!strcmp(directive,"near_neigh")) {
00691                         cin >> near_neigh;
00692                         true_nn = near_neigh + extra_nn;        // also reset true near neighs
00693                         valid_dirty = ANNtrue;                          // validation must be redone
00694                 }
00695                 else if (!strcmp(directive,"true_near_neigh")) {
00696                         cin >> true_nn;
00697                         valid_dirty = ANNtrue;                          // validation must be redone
00698                 }
00699                 //----------------------------------------------------------------
00700                 //      seed option
00701                 //              The seed is reset by setting the global annIdum to the
00702                 //              negation of the seed value.  See rand.cpp.
00703                 //----------------------------------------------------------------
00704                 else if (!strcmp(directive,"seed")) {
00705                         cin >> annIdum;
00706                         annIdum = -annIdum;
00707                 }
00708                 //----------------------------------------------------------------
00709                 //      validate option
00710                 //----------------------------------------------------------------
00711                 else if (!strcmp(directive,"validate")) {
00712                         cin >> arg;                                                     // input argument
00713                         if (!strcmp(arg, "on")) {
00714                                 validate = ANNtrue;
00715                                 cout << "validate = on   "
00716                                          << "(Warning: this may slow execution time.)\n";
00717                         }
00718                         else if (!strcmp(arg, "off")) {
00719                                 validate = ANNfalse;
00720                         }
00721                         else {
00722                                 cerr << "Argument: " << arg << "\n";
00723                                 Error("validate argument must be \"on\" or \"off\"", ANNabort);
00724                         }
00725                 }
00726                 //----------------------------------------------------------------
00727                 //      distribution option
00728                 //----------------------------------------------------------------
00729                 else if (!strcmp(directive,"distribution")) {
00730                         cin >> arg;                                                     // input name and translate
00731                         distr = (Distrib) lookUp(arg, distr_table, N_DISTRIBS);
00732                         if (distr >= N_DISTRIBS) {                      // not something we recognize
00733                                 cerr << "Distribution: " << arg << "\n";
00734                                 Error("Unknown distribution", ANNabort);
00735                         }
00736                 }
00737                 //----------------------------------------------------------------
00738                 //      stats option
00739                 //----------------------------------------------------------------
00740                 else if (!strcmp(directive,"stats")) {
00741                         cin >> arg;                                                     // input name and translate
00742                         stats = (StatLev) lookUp(arg, stat_table, N_STAT_LEVELS);
00743                         if (stats >= N_STAT_LEVELS) {           // not something we recognize
00744                                 cerr << "Stats level: " << arg << "\n";
00745                                 Error("Unknown statistics level", ANNabort);
00746                         }
00747                         if (stats > SILENT)
00748                                 cout << "stats = " << arg << "\n";
00749                 }
00750                 //----------------------------------------------------------------
00751                 //      split_rule option
00752                 //----------------------------------------------------------------
00753                 else if (!strcmp(directive,"split_rule")) {
00754                         cin >> arg;                                                     // input split_rule name
00755                         split = (ANNsplitRule) lookUp(arg, split_table, N_SPLIT_RULES);
00756                         if (split >= N_SPLIT_RULES) {           // not something we recognize
00757                                 cerr << "Splitting rule: " << arg << "\n";
00758                                 Error("Unknown splitting rule", ANNabort);
00759                         }
00760                 }
00761                 //----------------------------------------------------------------
00762                 //      shrink_rule option
00763                 //----------------------------------------------------------------
00764                 else if (!strcmp(directive,"shrink_rule")) {
00765                         cin >> arg;                                                     // input split_rule name
00766                         shrink = (ANNshrinkRule) lookUp(arg, shrink_table, N_SHRINK_RULES);
00767                         if (shrink >= N_SHRINK_RULES) {         // not something we recognize
00768                                 cerr << "Shrinking rule: " << arg << "\n";
00769                                 Error("Unknown shrinking rule", ANNabort);
00770                         }
00771                 }
00772                 //----------------------------------------------------------------
00773                 //      label operation
00774                 //----------------------------------------------------------------
00775                 else if (!strcmp(directive,"output_label")) {
00776                         cin >> arg;
00777                         if (stats > SILENT)
00778                                 cout << "<" << arg << ">\n";
00779                 }
00780                 //----------------------------------------------------------------
00781                 //      gen_data_pts operation
00782                 //----------------------------------------------------------------
00783                 else if (!strcmp(directive,"gen_data_pts")) {
00784                         if (distr == PLANTED) {                         // planted distribution
00785                                 Error("Cannot use planted distribution for data points", ANNabort);
00786                         }
00787                         generatePts(                                            // generate data points
00788                                 data_pts,                                               // data points
00789                                 data_size,                                              // data size
00790                                 DATA,                                                   // data points
00791                                 new_clust);                                             // new clusters flag
00792                         valid_dirty = ANNtrue;                          // validation must be redone
00793                         new_clust = ANNfalse;                           // reset flag
00794                 }
00795                 //----------------------------------------------------------------
00796                 //      gen_query_pts operation
00797                 //              If the distribution is PLANTED, then the query points
00798                 //              are planted near the data points (which must already be
00799                 //              generated).
00800                 //----------------------------------------------------------------
00801                 else if (!strcmp(directive,"gen_query_pts")) {
00802                         if (distr == PLANTED) {                         // planted distribution
00803                                 if (data_pts == NULL) {
00804                                         Error("Must generate data points before query points for planted distribution", ANNabort);
00805                                 }
00806                                 generatePts(                                    // generate query points
00807                                         query_pts,                                      // point array
00808                                         query_size,                                     // number of query points
00809                                         QUERY,                                          // query points
00810                                         new_clust,                                      // new clusters flag
00811                                         data_pts,                                       // plant around data pts
00812                                         data_size);
00813                         }
00814                         else {                                                          // all other distributions
00815                                 generatePts(                                    // generate query points
00816                                         query_pts,                                      // point array
00817                                         query_size,                                     // number of query points
00818                                         QUERY,                                          // query points
00819                                         new_clust);                                     // new clusters flag
00820                         }
00821                         valid_dirty = ANNtrue;                          // validation must be redone
00822                         new_clust = ANNfalse;                           // reset flag
00823                 }
00824                 //----------------------------------------------------------------
00825                 //      read_data_pts operation
00826                 //----------------------------------------------------------------
00827                 else if (!strcmp(directive,"read_data_pts")) {
00828                         cin >> arg;                                                     // input file name
00829                         readPts(
00830                                 data_pts,                                               // point array
00831                                 data_size,                                              // number of points
00832                                 arg,                                                    // file name
00833                                 DATA);                                                  // data points
00834                         valid_dirty = ANNtrue;                          // validation must be redone
00835                 }
00836                 //----------------------------------------------------------------
00837                 //      read_query_pts operation
00838                 //----------------------------------------------------------------
00839                 else if (!strcmp(directive,"read_query_pts")) {
00840                         cin >> arg;                                                     // input file name
00841                         readPts(
00842                                 query_pts,                                              // point array
00843                                 query_size,                                             // number of points
00844                                 arg,                                                    // file name
00845                                 QUERY);                                                 // query points
00846                         valid_dirty = ANNtrue;                          // validation must be redone
00847                 }
00848                 //----------------------------------------------------------------
00849                 //      build_ann operation
00850                 //              We always invoke the constructor for bd-trees.  Note
00851                 //              that when the shrinking rule is NONE (which is true by
00852                 //              default), then this constructs a kd-tree.
00853                 //----------------------------------------------------------------
00854                 else if (!strcmp(directive,"build_ann")) {
00855                         //------------------------------------------------------------
00856                         //      Build the tree
00857                         //------------------------------------------------------------
00858                         if (the_tree != NULL) {                         // tree exists already
00859                                 delete the_tree;                                // get rid of it
00860                         }
00861                         clock0 = clock();                                       // start time
00862 
00863                         the_tree = new ANNbd_tree(                      // build it
00864                                         data_pts,                                       // the data points
00865                                         data_size,                                      // number of points
00866                                         dim,                                            // dimension of space
00867                                         bucket_size,                            // maximum bucket size
00868                                         split,                                          // splitting rule
00869                                         shrink);                                        // shrinking rule
00870 
00871                         //------------------------------------------------------------
00872                         //      Print summary
00873                         //------------------------------------------------------------
00874                         long prep_time = clock() - clock0;      // end of prep time
00875 
00876                         if (stats > SILENT) {
00877                                 cout << "[Build ann-structure:\n";
00878                                 cout << "  split_rule    = " << split_table[split] << "\n";
00879                                 cout << "  shrink_rule   = " << shrink_table[shrink] << "\n";
00880                                 cout << "  data_size     = " << data_size << "\n";
00881                                 cout << "  dim           = " << dim << "\n";
00882                                 cout << "  bucket_size   = " << bucket_size << "\n";
00883 
00884                                 if (stats >= EXEC_TIME) {               // output processing time
00885                                         cout << "  process_time  = "
00886                                                  << double(prep_time)/CLOCKS_PER_SEC << " sec\n";
00887                                 }
00888 
00889                                 if (stats >= PREP_STATS)                // output or check tree stats
00890                                         treeStats(cout, ANNtrue);       // print tree stats
00891                                 else
00892                                         treeStats(cout, ANNfalse);      // check stats
00893 
00894                                 if (stats >= SHOW_STRUCT) {             // print the whole tree
00895                                         cout << "  (Structure Contents:\n";
00896                                         the_tree->Print(ANNfalse, cout);
00897                                         cout << "  )\n";
00898                                 }
00899                                 cout << "]\n";
00900                         }
00901                 }
00902                 //----------------------------------------------------------------
00903                 //      dump operation
00904                 //----------------------------------------------------------------
00905                 else if (!strcmp(directive,"dump")) {
00906                         cin >> arg;                                                     // input file name
00907                         if (the_tree == NULL) {                         // no tree
00908                                 Error("Cannot dump.  No tree has been built yet", ANNwarn);
00909                         }
00910                         else {                                                          // there is a tree
00911                                                                                                 // try to open file
00912                                 ofstream out_dump_file(arg);
00913                                 if (!out_dump_file) {
00914                                         cerr << "File name: " << arg << "\n";
00915                                         Error("Cannot open dump file", ANNabort);
00916                                 }
00917                                                                                                 // dump the tree and points
00918                                 the_tree->Dump(ANNtrue, out_dump_file);
00919                                 if (stats > SILENT) {
00920                                         cout << "(Tree has been dumped to file " << arg << ")\n";
00921                                 }
00922                         }
00923                 }
00924                 //----------------------------------------------------------------
00925                 //      load operation
00926                 //              Since this not only loads a tree, but loads a new set
00927                 //              of data points.
00928                 //----------------------------------------------------------------
00929                 else if (!strcmp(directive,"load")) {
00930                         cin >> arg;                                                     // input file name
00931                         if (the_tree != NULL) {                         // tree exists already
00932                                 delete the_tree;                                // get rid of it
00933                         }
00934                         if (data_pts != NULL) {                         // data points exist already
00935                                 delete data_pts;                                // get rid of them
00936                         }
00937 
00938                         ifstream in_dump_file(arg);                     // try to open file
00939                         if (!in_dump_file) {
00940                                 cerr << "File name: " << arg << "\n";
00941                                 Error("Cannot open file for loading", ANNabort);
00942                         }
00943                                                                                                 // build tree by loading
00944                         the_tree = new ANNbd_tree(in_dump_file);
00945 
00946                         dim = the_tree->theDim();                       // new dimension
00947                         data_size = the_tree->nPoints();        // number of points
00948                         data_pts = the_tree->thePoints();       // new points
00949 
00950                         valid_dirty = ANNtrue;                          // validation must be redone
00951 
00952                         if (stats > SILENT) {
00953                                         cout << "(Tree has been loaded from file " << arg << ")\n";
00954                         }
00955                         if (stats >= SHOW_STRUCT) {                     // print the tree
00956                                 cout << "  (Structure Contents:\n";
00957                                 the_tree->Print(ANNfalse, cout);
00958                                 cout << "  )\n";
00959                         }
00960                 }
00961                 //----------------------------------------------------------------
00962                 //      run_queries operation
00963                 //              This section does all the query processing.  It consists
00964                 //              of the following subsections:
00965                 //
00966                 //              **      input the argument (standard or priority) and output
00967                 //                      the header describing the essential information.
00968                 //              **      allocate space for the results to be stored.
00969                 //              **      run the queries by invoking the appropriate search
00970                 //                      procedure on the query points.  Print nearest neighbor
00971                 //                      if requested.
00972                 //              **      print final summaries
00973                 //
00974                 //      The approach for processing multiple nearest neighbors is
00975                 //      pretty crude.  We allocate an array whose size is the
00976                 //      product of the total number of queries times the number of
00977                 //      nearest neighbors (k), and then use each k consecutive
00978                 //      entries to store the results of each query.
00979                 //----------------------------------------------------------------
00980                 else if (!strcmp(directive,"run_queries")) {
00981 
00982                         //------------------------------------------------------------
00983                         //      Input arguments and print summary
00984                         //------------------------------------------------------------
00985                         enum {STANDARD, PRIORITY} method;
00986 
00987                         cin >> arg;                                                     // input argument
00988                         if (!strcmp(arg, "standard")) {
00989                                 method = STANDARD;
00990                         }
00991                         else if (!strcmp(arg, "priority")) {
00992                                 method = PRIORITY;
00993                         }
00994                         else {
00995                                 cerr << "Search type: " << arg << "\n";
00996                                 Error("Search type must be \"standard\" or \"priority\"",
00997                                                 ANNabort);
00998                         }
00999                         if (data_pts == NULL || query_pts == NULL) {
01000                                 Error("Either data set and query set not constructed", ANNabort);
01001                         }
01002                         if (the_tree == NULL) {
01003                                 Error("No search tree built.", ANNabort);
01004                         }
01005 
01006                         //------------------------------------------------------------
01007                         //      Set up everything
01008                         //------------------------------------------------------------
01009 
01010                         #ifdef ANN_PERF                                         // performance only
01011                                 annResetStats(data_size);                       // reset statistics
01012                         #endif
01013 
01014                         clock0 = clock();                                       // start time
01015                                                                                                 // deallocate existing storage
01016                         if (apx_nn_idx           != NULL) delete [] apx_nn_idx;
01017                         if (apx_dists            != NULL) delete [] apx_dists;
01018                         if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
01019                                                                                                 // allocate apx answer storage
01020                         apx_nn_idx = new ANNidx[near_neigh*query_size];
01021                         apx_dists  = new ANNdist[near_neigh*query_size];
01022                         apx_pts_in_range = new int[query_size];
01023 
01024                         annMaxPtsVisit(max_pts_visit);          // set max points to visit
01025 
01026                         //------------------------------------------------------------
01027                         //      Run the queries
01028                         //------------------------------------------------------------
01029                                                                                                 // pointers for current query
01030                         ANNidxArray       curr_nn_idx = apx_nn_idx;
01031                         ANNdistArray  curr_dists  = apx_dists;
01032 
01033                         for (int i = 0; i < query_size; i++) {
01034                                 #ifdef ANN_PERF
01035                                         annResetCounts();                       // reset counters
01036                                 #endif
01037                                 apx_pts_in_range[i] = 0;
01038 
01039                                 if (radius_bound == 0) {                // no radius bound
01040                                         if (method == STANDARD) {
01041                                                 the_tree->annkSearch(
01042                                                         query_pts[i],           // query point
01043                                                         near_neigh,                     // number of near neighbors
01044                                                         curr_nn_idx,            // nearest neighbors (returned)
01045                                                         curr_dists,                     // distance (returned)
01046                                                         epsilon);                       // error bound
01047                                         }
01048                                         else if (method == PRIORITY) {
01049                                                 the_tree->annkPriSearch(
01050                                                         query_pts[i],           // query point
01051                                                         near_neigh,                     // number of near neighbors
01052                                                         curr_nn_idx,            // nearest neighbors (returned)
01053                                                         curr_dists,                     // distance (returned)
01054                                                         epsilon);                       // error bound
01055                                         }
01056                                         else {
01057                                                 Error("Internal error - invalid method", ANNabort);
01058                                         }
01059                                 }
01060                                 else {                                                  // use radius bound
01061                                         if (method != STANDARD) {
01062                                                 Error("A nonzero radius bound assumes standard search",
01063                                                         ANNwarn);
01064                                         }
01065                                         apx_pts_in_range[i] = the_tree->annkFRSearch(
01066                                                 query_pts[i],                   // query point
01067                                                 ANN_POW(radius_bound),  // squared radius search bound
01068                                                 near_neigh,                             // number of near neighbors
01069                                                 curr_nn_idx,                    // nearest neighbors (returned)
01070                                                 curr_dists,                             // distance (returned)
01071                                                 epsilon);                               // error bound
01072                                 }
01073                                 curr_nn_idx += near_neigh;              // increment current pointers
01074                                 curr_dists      += near_neigh;
01075 
01076                                 #ifdef ANN_PERF
01077                                         annUpdateStats();                       // update stats
01078                                 #endif
01079                         }
01080 
01081                         long query_time = clock() - clock0; // end of query time
01082 
01083                         if (validate) {                                         // validation requested
01084                                 if (valid_dirty) getTrueNN();   // get true near neighbors
01085                                 doValidation();                                 // validate
01086                         }
01087 
01088                         //------------------------------------------------------------
01089                         //      Print summaries
01090                         //------------------------------------------------------------
01091                 
01092                         if (stats > SILENT) {
01093                                 cout << "[Run Queries:\n";
01094                                 cout << "  query_size    = " << query_size << "\n";
01095                                 cout << "  dim           = " << dim << "\n";
01096                                 cout << "  search_method = " << arg << "\n";
01097                                 cout << "  epsilon       = " << epsilon << "\n";
01098                                 cout << "  near_neigh    = " << near_neigh << "\n";
01099                                 if (max_pts_visit != 0)
01100                                         cout << "  max_pts_visit = " << max_pts_visit << "\n";
01101                                 if (radius_bound != 0)
01102                                         cout << "  radius_bound  = " << radius_bound << "\n";
01103                                 if (validate)
01104                                         cout << "  true_nn       = " << true_nn << "\n";
01105 
01106                                 if (stats >= EXEC_TIME) {               // print exec time summary
01107                                         cout << "  query_time    = " <<
01108                                                 double(query_time)/(query_size*CLOCKS_PER_SEC)
01109                                                  << " sec/query";
01110                                         #ifdef ANN_PERF
01111                                                 cout << " (biased by perf measurements)";
01112                                         #endif
01113                                         cout << "\n";
01114                                 }
01115 
01116                                 if (stats >= QUERY_STATS) {             // output performance stats
01117                                         #ifdef ANN_PERF
01118                                                 cout.flush();
01119                                                 annPrintStats(validate);
01120                                         #else
01121                                                 cout << "  (Performance statistics unavailable.)\n";
01122                                         #endif
01123                                 }
01124 
01125                                 if (stats >= QUERY_RES) {               // output results
01126                                         cout << "  (Query Results:\n";
01127                                         cout << "    Pt\tANN\tDist\n";
01128                                         curr_nn_idx = apx_nn_idx;       // subarray pointers
01129                                         curr_dists  = apx_dists;
01130                                                                                                 // output nearest neighbors
01131                                         for (int i = 0; i < query_size; i++) {
01132                                                 cout << " " << setw(4) << i;
01133                                                 for (int j = 0; j < near_neigh; j++) {
01134                                                                                                 // exit if no more neighbors
01135                                                         if (curr_nn_idx[j] == ANN_NULL_IDX) {
01136                                                                 cout << "\t[no other pts in radius bound]\n";
01137                                                                 break;
01138                                                         }
01139                                                         else {                          // output point info
01140                                                                 cout << "\t" << curr_nn_idx[j]
01141                                                                         << "\t" << ANN_ROOT(curr_dists[j])
01142                                                                         << "\n";
01143                                                         }
01144                                                 }
01145                                                                                                 // output range count
01146                                                 if (radius_bound != 0) {
01147                                                         cout << "    pts_in_radius_bound = "
01148                                                                 << apx_pts_in_range[i] << "\n";
01149                                                 }
01150                                                                                                 // increment subarray pointers
01151                                                 curr_nn_idx += near_neigh;
01152                                                 curr_dists  += near_neigh;
01153                                         }
01154                                         cout << "  )\n";
01155                                 }
01156                                 cout << "]\n";
01157                         }
01158                 }
01159                 //----------------------------------------------------------------
01160                 //      Unknown directive
01161                 //----------------------------------------------------------------
01162                 else {
01163                         cerr << "Directive: " << directive << "\n";
01164                         Error("Unknown directive", ANNabort);
01165                 }
01166         }
01167         //--------------------------------------------------------------------
01168         //      End of input loop (deallocate stuff that was allocated)
01169         //--------------------------------------------------------------------
01170         if (the_tree            != NULL) delete the_tree;
01171         if (data_pts            != NULL) annDeallocPts(data_pts);
01172         if (query_pts           != NULL) annDeallocPts(query_pts);
01173         if (apx_nn_idx          != NULL) delete [] apx_nn_idx;
01174         if (apx_dists           != NULL) delete [] apx_dists;
01175         if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
01176 
01177         annClose();                     // close ANN
01178 
01179         return EXIT_SUCCESS;
01180 }
01181 
01182 //------------------------------------------------------------------------
01183 // generatePts - call appropriate routine to generate points of a
01184 //              given distribution.
01185 //------------------------------------------------------------------------
01186 
01187 void generatePts(
01188         ANNpointArray           &pa,                    // point array (returned)
01189         int                                     n,                              // number of points to generate
01190         PtType                          type,                   // point type
01191         ANNbool                         new_clust,              // new cluster centers desired?
01192         ANNpointArray           src,                    // source array (if distr=PLANTED)
01193         int                                     n_src)                  // source size (if distr=PLANTED)
01194 {
01195         if (pa != NULL) annDeallocPts(pa);                      // get rid of any old points
01196         pa = annAllocPts(n, dim);                                       // allocate point storage
01197 
01198         switch (distr) {
01199                 case UNIFORM:                                                   // uniform over cube [-1,1]^d.
01200                         annUniformPts(pa, n, dim);
01201                         break;
01202                 case GAUSS:                                                             // Gaussian with mean 0
01203                         annGaussPts(pa, n, dim, std_dev);
01204                         break;
01205                 case LAPLACE:                                                   // Laplacian, mean 0 and var 1
01206                         annLaplacePts(pa, n, dim);
01207                         break;
01208                 case CO_GAUSS:                                                  // correlated Gaussian
01209                         annCoGaussPts(pa, n, dim, corr_coef);
01210                         break;
01211                 case CO_LAPLACE:                                                // correlated Laplacian
01212                         annCoLaplacePts(pa, n, dim, corr_coef);
01213                         break;
01214                 case CLUS_GAUSS:                                                // clustered Gaussian
01215                         annClusGaussPts(pa, n, dim, n_color, new_clust, std_dev);
01216                         break;
01217                 case CLUS_ORTH_FLATS:                                   // clustered on orthog flats
01218                         annClusOrthFlats(pa, n, dim, n_color, new_clust, std_dev, max_dim);
01219                         break;
01220                 case CLUS_ELLIPSOIDS:                                   // clustered ellipsoids
01221                         annClusEllipsoids(pa, n, dim, n_color, new_clust, std_dev,
01222                                                 std_dev_lo, std_dev_hi, max_dim);
01223                         break;
01224                 case PLANTED:                                                   // planted distribution
01225                         annPlanted(pa, n, dim, src, n_src, std_dev);
01226                         break;
01227                 default:
01228                         Error("INTERNAL ERROR: Unknown distribution", ANNabort);
01229                         break;
01230         }
01231 
01232         if (stats > SILENT) {
01233                 if(type == DATA) cout << "[Generating Data Points:\n";
01234                 else                     cout << "[Generating Query Points:\n";
01235                 cout << "  number        = " << n << "\n";
01236                 cout << "  dim           = " << dim << "\n";
01237                 cout << "  distribution  = " << distr_table[distr] << "\n";
01238                 if (annIdum < 0)
01239                         cout << "  seed          = " << annIdum << "\n";
01240                 if (distr == GAUSS || distr == CLUS_GAUSS
01241                  || distr == CLUS_ORTH_FLATS)
01242                         cout << "  std_dev       = " << std_dev << "\n";
01243                 if (distr == CLUS_ELLIPSOIDS) {
01244                         cout << "  std_dev       = " << std_dev << " (small) \n";
01245                         cout << "  std_dev_lo    = " << std_dev_lo << "\n";
01246                         cout << "  std_dev_hi    = " << std_dev_hi << "\n";
01247                 }
01248                 if (distr == CO_GAUSS || distr == CO_LAPLACE)
01249                         cout << "  corr_coef     = " << corr_coef << "\n";
01250                 if (distr == CLUS_GAUSS || distr == CLUS_ORTH_FLATS
01251                  || distr == CLUS_ELLIPSOIDS) {
01252                         cout << "  colors        = " << n_color << "\n";
01253                         if (new_clust)
01254                                 cout << "  (cluster centers regenerated)\n";
01255                 }
01256                 if (distr == CLUS_ORTH_FLATS || distr == CLUS_ELLIPSOIDS) {
01257                         cout << "  max_dim       = " << max_dim << "\n";
01258                 }
01259         }
01260                                                                                                 // want to see points?
01261         if ((type == DATA  && stats >= SHOW_PTS) ||
01262                 (type == QUERY && stats >= QUERY_RES)) {
01263                 if(type == DATA) cout << "(Data Points:\n";
01264                 else                     cout << "(Query Points:\n";
01265                 for (int i = 0; i < n; i++) {
01266                         cout << " " << setw(4) << i << "\t";
01267                         printPoint(pa[i], dim);
01268                         cout << "\n";
01269                 }
01270                 cout << "  )\n";
01271         }
01272         cout << "]\n";
01273 }
01274 
01275 //------------------------------------------------------------------------
01276 // readPts - read a collection of data or query points.
01277 //------------------------------------------------------------------------
01278 
01279 void readPts(
01280         ANNpointArray           &pa,                    // point array (returned)
01281         int                                     &n,                             // number of points
01282         char                            *file_nm,               // file name
01283         PtType                          type)                   // point type (DATA, QUERY)
01284 {
01285         int i;
01286         //--------------------------------------------------------------------
01287         //      Open input file and read points
01288         //--------------------------------------------------------------------
01289         ifstream in_file(file_nm);                                      // try to open data file
01290         if (!in_file) {
01291                 cerr << "File name: " << file_nm << "\n";
01292                 Error("Cannot open input data/query file", ANNabort);
01293         }
01294                                                                                                 // allocate storage for points
01295         if (pa != NULL) annDeallocPts(pa);                      // get rid of old points
01296         pa = annAllocPts(n, dim);
01297 
01298         for (i = 0; i < n; i++) {                                       // read the data
01299                 if (!(in_file >> pa[i][0])) break;
01300                 for (int d = 1; d < dim; d++) {
01301                         in_file >> pa[i][d];
01302                 }
01303         }
01304 
01305         char ignore_me;                                                         // character for EOF test
01306         in_file >> ignore_me;                                           // try to get one more character
01307         if (!in_file.eof()) {                                           // exhausted space before eof
01308                 if (type == DATA) 
01309                         Error("`data_size' too small. Input file truncated.", ANNwarn);
01310                 else
01311                         Error("`query_size' too small. Input file truncated.", ANNwarn);
01312         }
01313         n = i;                                                                          // number of points read
01314 
01315         //--------------------------------------------------------------------
01316         //      Print summary
01317         //--------------------------------------------------------------------
01318         if (stats > SILENT) {
01319                 if (type == DATA) {
01320                         cout << "[Read Data Points:\n";
01321                         cout << "  data_size  = " << n << "\n";
01322                 }
01323                 else {
01324                         cout << "[Read Query Points:\n";
01325                         cout << "  query_size = " << n << "\n";
01326                 }
01327                 cout << "  file_name  = " << file_nm << "\n";
01328                 cout << "  dim        = " << dim << "\n";
01329                                                                                                 // print if results requested
01330                 if ((type == DATA && stats >= SHOW_PTS) ||
01331                         (type == QUERY && stats >= QUERY_RES)) {
01332                         cout << "  (Points:\n";
01333                         for (i = 0; i < n; i++) {
01334                                 cout << "    " << i << "\t";
01335                                 printPoint(pa[i], dim);
01336                                 cout << "\n";
01337                         }
01338                         cout << "  )\n";
01339                 }
01340                 cout << "]\n";
01341         }
01342 }
01343 
01344 //------------------------------------------------------------------------
01345 //      getTrueNN
01346 //              Computes the true nearest neighbors.  For purposes of validation,
01347 //              this intentionally done in a rather dumb (but safe way), by
01348 //              invoking the brute-force search.
01349 //
01350 //              The number of true nearest neighbors is somewhat larger than
01351 //              the number of nearest neighbors.  This is so that the validation
01352 //              can determine the expected difference in element ranks.
01353 //
01354 //              This procedure is invoked just prior to running queries.  Since
01355 //              the operation takes a long time, it is performed only if needed.
01356 //              In particular, once generated, it will be regenerated only if
01357 //              new query or data points are generated, or if the requested number
01358 //              of true near neighbors or approximate near neighbors has changed.
01359 //
01360 //              To validate fixed-radius searching, we compute two counts, one
01361 //              with the original query radius (trueSqRadius) and the other with
01362 //              a radius shrunken by the error factor (minSqradius).  We then
01363 //              check that the count of points inside the approximate range is
01364 //              between these two bounds.  Because fixed-radius search is
01365 //              allowed to ignore points within the shrunken radius, we only
01366 //              compute exact neighbors within this smaller distance (for we
01367 //              cannot guarantee that we will even visit the other points).
01368 //------------------------------------------------------------------------
01369 
01370 void getTrueNN()                                                // compute true nearest neighbors
01371 {
01372         if (stats > SILENT) {
01373                 cout << "(Computing true nearest neighbors for validation.  This may take time.)\n";
01374         }
01375                                                                                                 // deallocate existing storage
01376         if (true_nn_idx                 != NULL) delete [] true_nn_idx;
01377         if (true_dists                  != NULL) delete [] true_dists;
01378         if (min_pts_in_range    != NULL) delete [] min_pts_in_range;
01379         if (max_pts_in_range    != NULL) delete [] max_pts_in_range;
01380 
01381         if (true_nn > data_size) {                                      // can't get more nn than points
01382                 true_nn = data_size;
01383         }
01384 
01385                                                                                                 // allocate true answer storage
01386         true_nn_idx = new ANNidx[true_nn*query_size];
01387         true_dists  = new ANNdist[true_nn*query_size];
01388         min_pts_in_range = new int[query_size];
01389         max_pts_in_range = new int[query_size];
01390 
01391         ANNidxArray  curr_nn_idx = true_nn_idx;         // current locations in arrays
01392         ANNdistArray curr_dists = true_dists;
01393 
01394                                                                                                 // allocate search structure
01395         ANNbruteForce *the_brute = new ANNbruteForce(data_pts, data_size, dim);
01396                                                                                                 // compute nearest neighbors
01397         for (int i = 0; i < query_size; i++) {
01398                 if (radius_bound == 0) {                                // standard kNN search
01399                         the_brute->annkSearch(                          // compute true near neighbors
01400                                                 query_pts[i],                   // query point
01401                                                 true_nn,                                // number of nearest neighbors
01402                                                 curr_nn_idx,                    // where to put indices
01403                                                 curr_dists);                    // where to put distances
01404                 }
01405                 else {                                                                  // fixed radius kNN search
01406                                                                                                 // search radii limits
01407                         ANNdist trueSqRadius = ANN_POW(radius_bound);
01408                         ANNdist minSqRadius = ANN_POW(radius_bound / (1+epsilon));
01409                         min_pts_in_range[i] = the_brute->annkFRSearch(
01410                                                 query_pts[i],                   // query point
01411                                                 minSqRadius,                    // shrunken search radius
01412                                                 true_nn,                                // number of near neighbors
01413                                                 curr_nn_idx,                    // nearest neighbors (returned)
01414                                                 curr_dists);                    // distance (returned)
01415                         max_pts_in_range[i] = the_brute->annkFRSearch(
01416                                                 query_pts[i],                   // query point
01417                                                 trueSqRadius,                   // true search radius
01418                                                 0, NULL, NULL);                 // (ignore kNN info)
01419                 }
01420                 curr_nn_idx += true_nn;                                 // increment nn index pointer
01421                 curr_dists  += true_nn;                                 // increment nn dist pointer
01422         }
01423         delete the_brute;                                                       // delete brute-force struct
01424         valid_dirty = ANNfalse;                                         // validation good for now
01425 }
01426 
01427 //------------------------------------------------------------------------
01428 //      doValidation
01429 //              Compares the approximate answers to the k-nearest neighbors
01430 //              against the true nearest neighbors (computed earlier). It is
01431 //              assumed that the true nearest neighbors and indices have been
01432 //              computed earlier.
01433 //
01434 //              First, we check that all the results are within their allowed
01435 //              limits, and generate an internal error, if not.  For the sake of
01436 //              performance evaluation, we also compute the following two
01437 //              quantities for nearest neighbors:
01438 //
01439 //              Average Error
01440 //              -------------
01441 //              The relative error between the distance to a reported nearest
01442 //              neighbor and the true nearest neighbor (of the same rank),
01443 //
01444 //              Rank Error
01445 //              ----------
01446 //              The difference in rank between the reported nearest neighbor and
01447 //              its position (if any) among the true nearest neighbors.  If we
01448 //              cannot find this point among the true nearest neighbors, then
01449 //              it assumed that the rank of the true nearest neighbor is true_nn+1.
01450 //
01451 //              Because of the possibility of duplicate distances, this is computed
01452 //              as follows.  For the j-th reported nearest neighbor, we count the
01453 //              number of true nearest neighbors that are at least this close.  Let
01454 //              this be rnk.  Then the rank error is max(0, j-rnk).  (In the code
01455 //              below, j is an array index and so the first item is 0, not 1.  Thus
01456 //              we take max(0, j+1-rnk) instead.)
01457 //
01458 //              For the results of fixed-radious range count, we verify that the
01459 //              reported number of points in the range lies between the actual
01460 //              number of points in the shrunken and the true search radius.
01461 //------------------------------------------------------------------------
01462 
01463 void doValidation()                                             // perform validation
01464 {
01465         int*              curr_apx_idx = apx_nn_idx;    // approx index pointer
01466         ANNdistArray  curr_apx_dst = apx_dists;         // approx distance pointer
01467         int*              curr_tru_idx = true_nn_idx;   // true index pointer
01468         ANNdistArray  curr_tru_dst = true_dists;        // true distance pointer
01469         int i, j;
01470 
01471         if (true_nn < near_neigh) {
01472                 Error("Cannot validate with fewer true near neighbors than actual", ANNabort);
01473         }
01474 
01475         for (i = 0; i < query_size; i++) {                      // validate each query
01476                 //----------------------------------------------------------------
01477                 //      Compute result errors
01478                 //              In fixed radius search it is possible that not all k
01479                 //              nearest neighbors were computed.  Because the true
01480                 //              results are computed over the shrunken radius, we should
01481                 //              have at least as many true nearest neighbors as
01482                 //              approximate nearest neighbors. (If not, an infinite
01483                 //              error will be generated, and so an internal error will
01484                 //              will be generated.
01485                 //
01486                 //              Because nearest neighbors are sorted in increasing order
01487                 //              of distance, as soon as we see a null index, we can
01488                 //              terminate the distance checking.  The error in the
01489                 //              result should not exceed epsilon.  However, if
01490                 //              max_pts_visit is nonzero (meaning that the search is
01491                 //              terminated early) this might happen.
01492                 //----------------------------------------------------------------
01493                 for (j = 0; j < near_neigh; j++) {
01494                         if (curr_tru_idx[j] == ANN_NULL_IDX)// no more true neighbors?
01495                                 break;
01496                                                                                                 // true i-th smallest distance
01497                         double true_dist = ANN_ROOT(curr_tru_dst[j]);
01498                                                                                                 // reported i-th smallest
01499                         double rept_dist = ANN_ROOT(curr_apx_dst[j]);
01500                                                                                                 // better than optimum?
01501                         if (rept_dist < true_dist*(1-ERR)) {
01502                                 Error("INTERNAL ERROR: True nearest neighbor incorrect",
01503                                                 ANNabort);
01504                         }
01505 
01506                         double resultErr;                                       // result error
01507                         if (true_dist == 0.0) {                         // let's not divide by zero
01508                                 if (rept_dist != 0.0) resultErr = ANN_DBL_MAX;
01509                                 else                              resultErr = 0.0;
01510                         }
01511                         else {
01512                                 resultErr = (rept_dist - true_dist) / ((double) true_dist);
01513                         }
01514 
01515                         if (resultErr > epsilon && max_pts_visit == 0) {
01516                                 Error("INTERNAL ERROR: Actual error exceeds epsilon",
01517                                                 ANNabort);
01518                         }
01519                         #ifdef ANN_PERF
01520                                 ann_average_err += resultErr;   // update statistics error
01521                         #endif
01522                 }
01523                 //--------------------------------------------------------------------
01524                 //  Compute rank errors (only needed for perf measurements)
01525                 //--------------------------------------------------------------------
01526                 #ifdef ANN_PERF
01527                         for (j = 0; j < near_neigh; j++) {
01528                                 if (curr_tru_idx[i] == ANN_NULL_IDX) // no more true neighbors?
01529                                         break;
01530 
01531                                 double rnkErr = 0.0;                    // rank error
01532                                                                                                 // reported j-th distance
01533                                 ANNdist rept_dist = curr_apx_dst[j];
01534                                 int rnk = 0;                                    // compute rank of this item
01535                                 while (rnk < true_nn && curr_tru_dst[rnk] <= rept_dist)
01536                                         rnk++;
01537                                 if (j+1-rnk > 0) rnkErr = (double) (j+1-rnk);
01538                                 ann_rank_err += rnkErr;                 // update average rank error
01539                         }
01540                 #endif
01541                 //----------------------------------------------------------------
01542                 //      Check range counts from fixed-radius query
01543                 //----------------------------------------------------------------
01544                 if (radius_bound != 0) {                                // fixed-radius search
01545                         if  (apx_pts_in_range[i] < min_pts_in_range[i] ||
01546                                  apx_pts_in_range[i] > max_pts_in_range[i])
01547                                 Error("INTERNAL ERROR: Invalid fixed-radius range count",
01548                                                 ANNabort);
01549                 }
01550 
01551                 curr_apx_idx += near_neigh;
01552                 curr_apx_dst += near_neigh;
01553                 curr_tru_idx += true_nn;                                // increment current pointers
01554                 curr_tru_dst += true_nn;
01555         }
01556 }
01557 
01558 //----------------------------------------------------------------------
01559 //      treeStats
01560 //              Computes a number of statistics related to kd_trees and
01561 //              bd_trees.  These statistics are printed if in verbose mode,
01562 //              and otherwise they are only printed if they are deemed to
01563 //              be outside of reasonable operating bounds.
01564 //----------------------------------------------------------------------
01565 
01566 #define log2(x) (log(x)/log(2.0))                               // log base 2
01567 
01568 void treeStats(
01569         ostream         &out,                                                   // output stream
01570         ANNbool         verbose)                                                // print stats
01571 {
01572         const int       MIN_PTS         = 20;                           // min no. pts for checking
01573         const float MAX_FRAC_TL = 0.50;                         // max frac of triv leaves
01574         const float MAX_AVG_AR  = 20;                           // max average aspect ratio
01575 
01576         ANNkdStats st;                                                          // statistics structure
01577 
01578         the_tree->getStats(st);                                         // get statistics
01579                                                                                                 // total number of nodes
01580         int n_nodes = st.n_lf + st.n_spl + st.n_shr;
01581                                                                                                 // should be O(n/bs)
01582         int opt_n_nodes = (int) (2*(float(st.n_pts)/st.bkt_size));
01583         int too_many_nodes = 10*opt_n_nodes;
01584         if (st.n_pts >= MIN_PTS && n_nodes > too_many_nodes) {
01585                 out << "-----------------------------------------------------------\n";
01586                 out << "Warning: The tree has more than 10x as many nodes as points.\n";
01587                 out << "You may want to consider a different split or shrink method.\n";
01588                 out << "-----------------------------------------------------------\n";
01589                 verbose = ANNtrue;
01590         }
01591                                                                                                 // fraction of trivial leaves
01592         float frac_tl = (st.n_lf == 0 ? 0 : ((float) st.n_tl)/ st.n_lf);
01593         if (st.n_pts >= MIN_PTS && frac_tl > MAX_FRAC_TL) {
01594                 out << "-----------------------------------------------------------\n";
01595                 out << "Warning: A significant fraction of leaves contain no points.\n";
01596                 out << "You may want to consider a different split or shrink method.\n";
01597                 out << "-----------------------------------------------------------\n";
01598                 verbose = ANNtrue;
01599         }
01600                                                                                                 // depth should be O(dim*log n)
01601         int too_many_levels = (int) (2.0 * st.dim * log2((double) st.n_pts));
01602         int opt_levels = (int) log2(double(st.n_pts)/st.bkt_size);
01603         if (st.n_pts >= MIN_PTS && st.depth > too_many_levels) {
01604                 out << "-----------------------------------------------------------\n";
01605                 out << "Warning: The tree is more than 2x as deep as (dim*log n).\n";
01606                 out << "You may want to consider a different split or shrink method.\n";
01607                 out << "-----------------------------------------------------------\n";
01608                 verbose = ANNtrue;
01609         }
01610                                                                                                 // average leaf aspect ratio
01611         if (st.n_pts >= MIN_PTS && st.avg_ar > MAX_AVG_AR) {
01612                 out << "-----------------------------------------------------------\n";
01613                 out << "Warning: Average aspect ratio of cells is quite large.\n";
01614                 out << "This may slow queries depending on the point distribution.\n";
01615                 out << "-----------------------------------------------------------\n";
01616                 verbose = ANNtrue;
01617         }
01618 
01619         //------------------------------------------------------------------
01620         //  Print summaries if requested
01621         //------------------------------------------------------------------
01622         if (verbose) {                                                          // output statistics
01623                 out << "  (Structure Statistics:\n";
01624                 out << "    n_nodes          = " << n_nodes
01625                         << " (opt = " << opt_n_nodes
01626                         << ", best if < " << too_many_nodes << ")\n"
01627                         << "        n_leaves     = " << st.n_lf
01628                         << " (" << st.n_tl << " contain no points)\n"
01629                         << "        n_splits     = " << st.n_spl << "\n"
01630                         << "        n_shrinks    = " << st.n_shr << "\n";
01631                 out << "    empty_leaves     = " << frac_tl*100
01632                         << " percent (best if < " << MAX_FRAC_TL*100 << " percent)\n";
01633                 out << "    depth            = " << st.depth
01634                         << " (opt = " << opt_levels
01635                         << ", best if < " << too_many_levels << ")\n";
01636                 out << "    avg_aspect_ratio = " << st.avg_ar
01637                         << " (best if < " << MAX_AVG_AR << ")\n";
01638                 out << "  )\n";
01639         }
01640 }

Generated on Wed Nov 23 18:59:54 2011 for FreeCAD by  doxygen 1.6.1