00001 //---------------------------------------------------------------------- 00002 // File: kd_split.cpp 00003 // Programmer: Sunil Arya and David Mount 00004 // Description: Methods for splitting kd-trees 00005 // Last modified: 01/04/05 (Version 1.0) 00006 //---------------------------------------------------------------------- 00007 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and 00008 // David Mount. All Rights Reserved. 00009 // 00010 // This software and related documentation is part of the Approximate 00011 // Nearest Neighbor Library (ANN). This software is provided under 00012 // the provisions of the Lesser GNU Public License (LGPL). See the 00013 // file ../ReadMe.txt for further information. 00014 // 00015 // The University of Maryland (U.M.) and the authors make no 00016 // representations about the suitability or fitness of this software for 00017 // any purpose. It is provided "as is" without express or implied 00018 // warranty. 00019 //---------------------------------------------------------------------- 00020 // History: 00021 // Revision 0.1 03/04/98 00022 // Initial release 00023 // Revision 1.0 04/01/05 00024 //---------------------------------------------------------------------- 00025 00026 #include "kd_tree.h" // kd-tree definitions 00027 #include "kd_util.h" // kd-tree utilities 00028 #include "kd_split.h" // splitting functions 00029 00030 //---------------------------------------------------------------------- 00031 // Constants 00032 //---------------------------------------------------------------------- 00033 00034 const double ERR = 0.001; // a small value 00035 const double FS_ASPECT_RATIO = 3.0; // maximum allowed aspect ratio 00036 // in fair split. Must be >= 2. 00037 00038 //---------------------------------------------------------------------- 00039 // kd_split - Bentley's standard splitting routine for kd-trees 00040 // Find the dimension of the greatest spread, and split 00041 // just before the median point along this dimension. 00042 //---------------------------------------------------------------------- 00043 00044 void kd_split( 00045 ANNpointArray pa, // point array (permuted on return) 00046 ANNidxArray pidx, // point indices 00047 const ANNorthRect &bnds, // bounding rectangle for cell 00048 int n, // number of points 00049 int dim, // dimension of space 00050 int &cut_dim, // cutting dimension (returned) 00051 ANNcoord &cut_val, // cutting value (returned) 00052 int &n_lo) // num of points on low side (returned) 00053 { 00054 // find dimension of maximum spread 00055 cut_dim = annMaxSpread(pa, pidx, n, dim); 00056 n_lo = n/2; // median rank 00057 // split about median 00058 annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); 00059 } 00060 00061 //---------------------------------------------------------------------- 00062 // midpt_split - midpoint splitting rule for box-decomposition trees 00063 // 00064 // This is the simplest splitting rule that guarantees boxes 00065 // of bounded aspect ratio. It simply cuts the box with the 00066 // longest side through its midpoint. If there are ties, it 00067 // selects the dimension with the maximum point spread. 00068 // 00069 // WARNING: This routine (while simple) doesn't seem to work 00070 // well in practice in high dimensions, because it tends to 00071 // generate a large number of trivial and/or unbalanced splits. 00072 // Either kd_split(), sl_midpt_split(), or fair_split() are 00073 // recommended, instead. 00074 //---------------------------------------------------------------------- 00075 00076 void midpt_split( 00077 ANNpointArray pa, // point array 00078 ANNidxArray pidx, // point indices (permuted on return) 00079 const ANNorthRect &bnds, // bounding rectangle for cell 00080 int n, // number of points 00081 int dim, // dimension of space 00082 int &cut_dim, // cutting dimension (returned) 00083 ANNcoord &cut_val, // cutting value (returned) 00084 int &n_lo) // num of points on low side (returned) 00085 { 00086 int d; 00087 00088 ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; 00089 for (d = 1; d < dim; d++) { // find length of longest box side 00090 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00091 if (length > max_length) { 00092 max_length = length; 00093 } 00094 } 00095 ANNcoord max_spread = -1; // find long side with most spread 00096 for (d = 0; d < dim; d++) { 00097 // is it among longest? 00098 if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) { 00099 // compute its spread 00100 ANNcoord spr = annSpread(pa, pidx, n, d); 00101 if (spr > max_spread) { // is it max so far? 00102 max_spread = spr; 00103 cut_dim = d; 00104 } 00105 } 00106 } 00107 // split along cut_dim at midpoint 00108 cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2; 00109 // permute points accordingly 00110 int br1, br2; 00111 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00112 //------------------------------------------------------------------ 00113 // On return: pa[0..br1-1] < cut_val 00114 // pa[br1..br2-1] == cut_val 00115 // pa[br2..n-1] > cut_val 00116 // 00117 // We can set n_lo to any value in the range [br1..br2]. 00118 // We choose split so that points are most evenly divided. 00119 //------------------------------------------------------------------ 00120 if (br1 > n/2) n_lo = br1; 00121 else if (br2 < n/2) n_lo = br2; 00122 else n_lo = n/2; 00123 } 00124 00125 //---------------------------------------------------------------------- 00126 // sl_midpt_split - sliding midpoint splitting rule 00127 // 00128 // This is a modification of midpt_split, which has the nonsensical 00129 // name "sliding midpoint". The idea is that we try to use the 00130 // midpoint rule, by bisecting the longest side. If there are 00131 // ties, the dimension with the maximum spread is selected. If, 00132 // however, the midpoint split produces a trivial split (no points 00133 // on one side of the splitting plane) then we slide the splitting 00134 // (maintaining its orientation) until it produces a nontrivial 00135 // split. For example, if the splitting plane is along the x-axis, 00136 // and all the data points have x-coordinate less than the x-bisector, 00137 // then the split is taken along the maximum x-coordinate of the 00138 // data points. 00139 // 00140 // Intuitively, this rule cannot generate trivial splits, and 00141 // hence avoids midpt_split's tendency to produce trees with 00142 // a very large number of nodes. 00143 // 00144 //---------------------------------------------------------------------- 00145 00146 void sl_midpt_split( 00147 ANNpointArray pa, // point array 00148 ANNidxArray pidx, // point indices (permuted on return) 00149 const ANNorthRect &bnds, // bounding rectangle for cell 00150 int n, // number of points 00151 int dim, // dimension of space 00152 int &cut_dim, // cutting dimension (returned) 00153 ANNcoord &cut_val, // cutting value (returned) 00154 int &n_lo) // num of points on low side (returned) 00155 { 00156 int d; 00157 00158 ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; 00159 for (d = 1; d < dim; d++) { // find length of longest box side 00160 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00161 if (length > max_length) { 00162 max_length = length; 00163 } 00164 } 00165 ANNcoord max_spread = -1; // find long side with most spread 00166 for (d = 0; d < dim; d++) { 00167 // is it among longest? 00168 if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) { 00169 // compute its spread 00170 ANNcoord spr = annSpread(pa, pidx, n, d); 00171 if (spr > max_spread) { // is it max so far? 00172 max_spread = spr; 00173 cut_dim = d; 00174 } 00175 } 00176 } 00177 // ideal split at midpoint 00178 ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2; 00179 00180 ANNcoord min, max; 00181 annMinMax(pa, pidx, n, cut_dim, min, max); // find min/max coordinates 00182 00183 if (ideal_cut_val < min) // slide to min or max as needed 00184 cut_val = min; 00185 else if (ideal_cut_val > max) 00186 cut_val = max; 00187 else 00188 cut_val = ideal_cut_val; 00189 00190 // permute points accordingly 00191 int br1, br2; 00192 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00193 //------------------------------------------------------------------ 00194 // On return: pa[0..br1-1] < cut_val 00195 // pa[br1..br2-1] == cut_val 00196 // pa[br2..n-1] > cut_val 00197 // 00198 // We can set n_lo to any value in the range [br1..br2] to satisfy 00199 // the exit conditions of the procedure. 00200 // 00201 // if ideal_cut_val < min (implying br2 >= 1), 00202 // then we select n_lo = 1 (so there is one point on left) and 00203 // if ideal_cut_val > max (implying br1 <= n-1), 00204 // then we select n_lo = n-1 (so there is one point on right). 00205 // Otherwise, we select n_lo as close to n/2 as possible within 00206 // [br1..br2]. 00207 //------------------------------------------------------------------ 00208 if (ideal_cut_val < min) n_lo = 1; 00209 else if (ideal_cut_val > max) n_lo = n-1; 00210 else if (br1 > n/2) n_lo = br1; 00211 else if (br2 < n/2) n_lo = br2; 00212 else n_lo = n/2; 00213 } 00214 00215 //---------------------------------------------------------------------- 00216 // fair_split - fair-split splitting rule 00217 // 00218 // This is a compromise between the kd-tree splitting rule (which 00219 // always splits data points at their median) and the midpoint 00220 // splitting rule (which always splits a box through its center. 00221 // The goal of this procedure is to achieve both nicely balanced 00222 // splits, and boxes of bounded aspect ratio. 00223 // 00224 // A constant FS_ASPECT_RATIO is defined. Given a box, those sides 00225 // which can be split so that the ratio of the longest to shortest 00226 // side does not exceed ASPECT_RATIO are identified. Among these 00227 // sides, we select the one in which the points have the largest 00228 // spread. We then split the points in a manner which most evenly 00229 // distributes the points on either side of the splitting plane, 00230 // subject to maintaining the bound on the ratio of long to short 00231 // sides. To determine that the aspect ratio will be preserved, 00232 // we determine the longest side (other than this side), and 00233 // determine how narrowly we can cut this side, without causing the 00234 // aspect ratio bound to be exceeded (small_piece). 00235 // 00236 // This procedure is more robust than either kd_split or midpt_split, 00237 // but is more complicated as well. When point distribution is 00238 // extremely skewed, this degenerates to midpt_split (actually 00239 // 1/3 point split), and when the points are most evenly distributed, 00240 // this degenerates to kd-split. 00241 //---------------------------------------------------------------------- 00242 00243 void fair_split( 00244 ANNpointArray pa, // point array 00245 ANNidxArray pidx, // point indices (permuted on return) 00246 const ANNorthRect &bnds, // bounding rectangle for cell 00247 int n, // number of points 00248 int dim, // dimension of space 00249 int &cut_dim, // cutting dimension (returned) 00250 ANNcoord &cut_val, // cutting value (returned) 00251 int &n_lo) // num of points on low side (returned) 00252 { 00253 int d; 00254 ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; 00255 cut_dim = 0; 00256 for (d = 1; d < dim; d++) { // find length of longest box side 00257 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00258 if (length > max_length) { 00259 max_length = length; 00260 cut_dim = d; 00261 } 00262 } 00263 00264 ANNcoord max_spread = 0; // find legal cut with max spread 00265 cut_dim = 0; 00266 for (d = 0; d < dim; d++) { 00267 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00268 // is this side midpoint splitable 00269 // without violating aspect ratio? 00270 if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) { 00271 // compute spread along this dim 00272 ANNcoord spr = annSpread(pa, pidx, n, d); 00273 if (spr > max_spread) { // best spread so far 00274 max_spread = spr; 00275 cut_dim = d; // this is dimension to cut 00276 } 00277 } 00278 } 00279 00280 max_length = 0; // find longest side other than cut_dim 00281 for (d = 0; d < dim; d++) { 00282 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00283 if (d != cut_dim && length > max_length) 00284 max_length = length; 00285 } 00286 // consider most extreme splits 00287 ANNcoord small_piece = max_length / FS_ASPECT_RATIO; 00288 ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut 00289 ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut 00290 00291 int br1, br2; 00292 // is median below lo_cut ? 00293 if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) { 00294 cut_val = lo_cut; // cut at lo_cut 00295 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00296 n_lo = br1; 00297 } 00298 // is median above hi_cut? 00299 else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) { 00300 cut_val = hi_cut; // cut at hi_cut 00301 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00302 n_lo = br2; 00303 } 00304 else { // median cut preserves asp ratio 00305 n_lo = n/2; // split about median 00306 annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); 00307 } 00308 } 00309 00310 //---------------------------------------------------------------------- 00311 // sl_fair_split - sliding fair split splitting rule 00312 // 00313 // Sliding fair split is a splitting rule that combines the 00314 // strengths of both fair split with sliding midpoint split. 00315 // Fair split tends to produce balanced splits when the points 00316 // are roughly uniformly distributed, but it can produce many 00317 // trivial splits when points are highly clustered. Sliding 00318 // midpoint never produces trivial splits, and shrinks boxes 00319 // nicely if points are highly clustered, but it may produce 00320 // rather unbalanced splits when points are unclustered but not 00321 // quite uniform. 00322 // 00323 // Sliding fair split is based on the theory that there are two 00324 // types of splits that are "good": balanced splits that produce 00325 // fat boxes, and unbalanced splits provided the cell with fewer 00326 // points is fat. 00327 // 00328 // This splitting rule operates by first computing the longest 00329 // side of the current bounding box. Then it asks which sides 00330 // could be split (at the midpoint) and still satisfy the aspect 00331 // ratio bound with respect to this side. Among these, it selects 00332 // the side with the largest spread (as fair split would). It 00333 // then considers the most extreme cuts that would be allowed by 00334 // the aspect ratio bound. This is done by dividing the longest 00335 // side of the box by the aspect ratio bound. If the median cut 00336 // lies between these extreme cuts, then we use the median cut. 00337 // If not, then consider the extreme cut that is closer to the 00338 // median. If all the points lie to one side of this cut, then 00339 // we slide the cut until it hits the first point. This may 00340 // violate the aspect ratio bound, but will never generate empty 00341 // cells. However the sibling of every such skinny cell is fat, 00342 // and hence packing arguments still apply. 00343 // 00344 //---------------------------------------------------------------------- 00345 00346 void sl_fair_split( 00347 ANNpointArray pa, // point array 00348 ANNidxArray pidx, // point indices (permuted on return) 00349 const ANNorthRect &bnds, // bounding rectangle for cell 00350 int n, // number of points 00351 int dim, // dimension of space 00352 int &cut_dim, // cutting dimension (returned) 00353 ANNcoord &cut_val, // cutting value (returned) 00354 int &n_lo) // num of points on low side (returned) 00355 { 00356 int d; 00357 ANNcoord min, max; // min/max coordinates 00358 int br1, br2; // split break points 00359 00360 ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; 00361 cut_dim = 0; 00362 for (d = 1; d < dim; d++) { // find length of longest box side 00363 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00364 if (length > max_length) { 00365 max_length = length; 00366 cut_dim = d; 00367 } 00368 } 00369 00370 ANNcoord max_spread = 0; // find legal cut with max spread 00371 cut_dim = 0; 00372 for (d = 0; d < dim; d++) { 00373 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00374 // is this side midpoint splitable 00375 // without violating aspect ratio? 00376 if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) { 00377 // compute spread along this dim 00378 ANNcoord spr = annSpread(pa, pidx, n, d); 00379 if (spr > max_spread) { // best spread so far 00380 max_spread = spr; 00381 cut_dim = d; // this is dimension to cut 00382 } 00383 } 00384 } 00385 00386 max_length = 0; // find longest side other than cut_dim 00387 for (d = 0; d < dim; d++) { 00388 ANNcoord length = bnds.hi[d] - bnds.lo[d]; 00389 if (d != cut_dim && length > max_length) 00390 max_length = length; 00391 } 00392 // consider most extreme splits 00393 ANNcoord small_piece = max_length / FS_ASPECT_RATIO; 00394 ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut 00395 ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut 00396 // find min and max along cut_dim 00397 annMinMax(pa, pidx, n, cut_dim, min, max); 00398 // is median below lo_cut? 00399 if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) { 00400 if (max > lo_cut) { // are any points above lo_cut? 00401 cut_val = lo_cut; // cut at lo_cut 00402 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00403 n_lo = br1; // balance if there are ties 00404 } 00405 else { // all points below lo_cut 00406 cut_val = max; // cut at max value 00407 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00408 n_lo = n-1; 00409 } 00410 } 00411 // is median above hi_cut? 00412 else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) { 00413 if (min < hi_cut) { // are any points below hi_cut? 00414 cut_val = hi_cut; // cut at hi_cut 00415 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00416 n_lo = br2; // balance if there are ties 00417 } 00418 else { // all points above hi_cut 00419 cut_val = min; // cut at min value 00420 annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); 00421 n_lo = 1; 00422 } 00423 } 00424 else { // median cut is good enough 00425 n_lo = n/2; // split about median 00426 annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); 00427 } 00428 }