latest version v1.9 - last update 10 Apr 2010 |
00001 /* 00002 * Copyright (C) 2003, 2004, 2005, 2006 00003 * Lehrstuhl fuer Technische Informatik, RWTH-Aachen, Germany 00004 * 00005 * This file is part of the LTI-Computer Vision Library (LTI-Lib) 00006 * 00007 * The LTI-Lib is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public License (LGPL) 00009 * as published by the Free Software Foundation; either version 2.1 of 00010 * the License, or (at your option) any later version. 00011 * 00012 * The LTI-Lib is distributed in the hope that it will be 00013 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty 00014 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with the LTI-Lib; see the file LICENSE. If 00019 * not, write to the Free Software Foundation, Inc., 59 Temple Place - 00020 * Suite 330, Boston, MA 02111-1307, USA. 00021 */ 00022 00023 00024 /*-------------------------------------------------------------------- 00025 * project ....: LTI-Lib: Image Processing and Computer Vision Library 00026 * file .......: ltiDBScan.h 00027 * authors ....: Bastian Ibach 00028 * organization: LTI, RWTH Aachen 00029 * creation ...: 28.4.2003 00030 * revisions ..: $Id: ltiDBScan.h,v 1.10 2006/02/07 18:16:37 ltilib Exp $ 00031 */ 00032 00033 #ifndef _LTI_D_B_SCAN_H_ 00034 #define _LTI_D_B_SCAN_H_ 00035 00036 //TODO: include only those files which are needed in this header!! 00037 00038 // TODO: Check this include to parent class: 00039 #include "ltiClustering.h" 00040 #include "ltiKdTree.h" 00041 #include "ltiL2Distance.h" 00042 00043 // std / stl includes 00044 #include <list> 00045 00046 #undef _LTI_DEBUG 00047 //#define _LTI_DEBUG 3 // comment out this line to remove all debug info. 00048 #include "ltiDebug.h" 00049 00050 namespace lti { 00051 00052 /** 00053 * A class providing the DBScan for clustering 00054 * 00055 * DBScan is a density-based clustering algorithm, which is designed to find 00056 * clusters of abitrary shape. 00057 * 00058 * Using one point the algorithm searches for minPts in the eps-neighborhood 00059 * of this point. If not enough points are found, they are classified as 00060 * "NOISE" (Value = 0) else it tries to expand this found cluster by using 00061 * each found point for another search and assigns the current cluster ID to 00062 * each point. If then not enough points are found an new cluster is 00063 * created using an not jet classified point. The clustering finishes if 00064 * all points are clussified, that means either having a cluster ID or 00065 * marked as "NOISE". 00066 * 00067 * The original algorithm uses a R*-Tree as internal data struckture, this 00068 * implementation uses the lti::kdTree. The trees provide an efficient 00069 * method for searching points in an eps-neighborhood. 00070 * 00071 * DBScan requires three parameters: 00072 * 00073 * - \e buckedSize the number of elements stored in each leaf of the 00074 * kd-Tree. The default value is 2. 00075 * - \e minPts the minimum number of points which have to be found in the 00076 * eps neighborhood. The default value is 4. 00077 * - \e eps <b>IMPORTANT:</b> The size of the eps-neighborhood used for 00078 * searching. This parameter <b>MUST</b> be set by the user. The default 00079 * value is 0 and if it isn't change, the algorithms returns false. 00080 * 00081 * The eps is the critical parameter of this algorithm, because the number 00082 * of clusters depends on it. A good value would be the density of the 00083 * least dense cluster. But it is very hard to get this information on 00084 * advance. Normally one does not know the distribution of the points in 00085 * the space. If no cluster is found, all points are marked as noise! 00086 * 00087 * The type T describes the distance measure used in the internal data 00088 * structure. The Default value is l2SquareDistance 00089 * 00090 * The original DBScan algorithm was introduced in: 00091 * 00092 * Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu A 00093 * Density-Based Algorithm for Discovering Clusters in Large Spatial 00094 * Databases with Noise. Proceedings of 2nd International Conference on 00095 * Knowledge Discovery and Data Mining (KDD-96) 00096 * 00097 * URL: http://www.dbs.informatik.uni-muenchen.de/Publikationen/Papers/ 00098 * KDD-96.final.frame.ps 00099 */ 00100 00101 template <class T=l2SquareDistantor<vector<double> > > 00102 class DBScan : public clustering { 00103 public: 00104 00105 /** 00106 * the parameters for the class DBScan 00107 */ 00108 class parameters : public clustering::parameters { 00109 public: 00110 /** 00111 * default constructor 00112 */ 00113 parameters() : clustering::parameters(){ 00114 eps = double(1.0); 00115 minPts = int(6); 00116 buckedSize = int(4); 00117 distanceMeasure = L2SquareDistance; 00118 } 00119 00120 /** 00121 * copy constructor 00122 * @param other the parameters object to be copied 00123 */ 00124 parameters(const parameters& other) 00125 : clustering::parameters() { 00126 copy(other); 00127 } 00128 00129 /** 00130 * destructor 00131 */ 00132 virtual ~parameters(){} 00133 00134 /** 00135 * returns name of this type 00136 */ 00137 const char* getTypeName() const{ 00138 return "DBScan:parameters"; 00139 } 00140 00141 /** 00142 * copy the contents of a parameters object 00143 * @param other the parameters object to be copied 00144 * @return a reference to this parameters object 00145 */ 00146 parameters& copy(const parameters& other){ 00147 # ifndef _LTI_MSC_6 00148 // MS Visual C++ 6 is not able to compile this... 00149 clustering::parameters::copy(other); 00150 # else 00151 // ...so we have to use this workaround. 00152 // Conditional on that, copy may not be virtual. 00153 clustering::parameters& (clustering::parameters::* p_copy) 00154 (const clustering::parameters&) = 00155 clustering::parameters::copy; 00156 (this->*p_copy)(other); 00157 # endif 00158 eps = other.eps; 00159 minPts = other.minPts; 00160 buckedSize = other.buckedSize; 00161 distanceMeasure = other.distanceMeasure; 00162 return *this; 00163 }; 00164 00165 00166 /** 00167 * copy the contents of a parameters object 00168 * @param other the parameters object to be copied 00169 * @return a reference to this parameters object 00170 */ 00171 parameters& operator=(const parameters& other){ 00172 return copy(other); 00173 }; 00174 00175 00176 /** 00177 * returns a pointer to a clone of the parameters 00178 */ 00179 virtual classifier::parameters* clone() const{ 00180 return new parameters(*this); 00181 }; 00182 00183 /** 00184 * write the parameters in the given ioHandler 00185 * @param handler the ioHandler to be used 00186 * @param complete if true (the default) the enclosing begin/end will 00187 * be also written, otherwise only the data block will be written. 00188 * @return true if write was successful 00189 */ 00190 # ifndef _LTI_MSC_6 00191 bool write(ioHandler& handler,const bool complete=true) const 00192 # else 00193 bool writeMS(ioHandler& handler,const bool complete=true) const 00194 # endif 00195 { 00196 bool b = true; 00197 if (complete) { 00198 b = handler.writeBegin(); 00199 } 00200 00201 if (b) { 00202 lti::write(handler,"eps",eps); 00203 lti::write(handler,"minPts",minPts); 00204 lti::write(handler,"buckedSize",buckedSize); 00205 if (distanceMeasure==L1Distance) { 00206 lti::write(handler,"distanceMeasure","L1Distance"); 00207 } else if (distanceMeasure==L2Distance) { 00208 lti::write(handler,"distanceMeasure","L2Distance"); 00209 } else { 00210 lti::write(handler,"distanceMeasure","L2SquareDistance"); 00211 } 00212 } 00213 00214 # ifndef _LTI_MSC_6 00215 // This is the standard C++ code, which MS Visual C++ 6 is not able to 00216 // compile... 00217 b = b && clustering::parameters::write(handler,false); 00218 # else 00219 bool (clustering::parameters::* p_writeMS)(ioHandler&, 00220 const bool) const = 00221 clustering::parameters::writeMS; 00222 b = b && (this->*p_writeMS)(handler,false); 00223 # endif 00224 00225 if (complete) { 00226 b = b && handler.writeEnd(); 00227 } 00228 00229 return b; 00230 } 00231 00232 # ifdef _LTI_MSC_6 00233 virtual bool write(ioHandler& handler, 00234 const bool complete = true) const { 00235 // ...we need this workaround to cope with another really 00236 // awful MSVC bug. 00237 return writeMS(handler,complete); 00238 } 00239 # endif 00240 00241 00242 00243 /** 00244 * read the parameters from the given ioHandler 00245 * @param handler the ioHandler to be used 00246 * @param complete if true (the default) the enclosing begin/end will 00247 * be also written, otherwise only the data block will be written. 00248 * @return true if write was successful 00249 */ 00250 # ifndef _LTI_MSC_6 00251 virtual bool read(ioHandler& handler,const bool complete = true) 00252 # else 00253 bool readMS(ioHandler& handler,const bool complete=true) 00254 # endif 00255 { 00256 bool b = true; 00257 if (complete) { 00258 b = handler.readBegin(); 00259 } 00260 00261 if (b) { 00262 lti::read(handler,"eps",eps); 00263 lti::read(handler,"minPts",minPts); 00264 lti::read(handler,"buckedSize",buckedSize); 00265 std::string tmpStr; 00266 lti::read(handler,"distanceMeasure",tmpStr); 00267 if (tmpStr=="L1Distance") { 00268 distanceMeasure=L1Distance; 00269 } else if (tmpStr=="L2Distance") { 00270 distanceMeasure=L2Distance; 00271 } else { 00272 distanceMeasure=L2SquareDistance; 00273 } 00274 00275 } 00276 00277 # ifndef _LTI_MSC_6 00278 // This is the standard C++ code, which MS Visual C++ 6 is not able to 00279 // compile... 00280 b = b && clustering::parameters::read(handler,false); 00281 # else 00282 bool (clustering::parameters::* p_readMS)(ioHandler&, 00283 const bool) = 00284 clustering::parameters::readMS; 00285 b = b && (this->*p_readMS)(handler,false); 00286 # endif 00287 00288 if (complete) { 00289 b = b && handler.readEnd(); 00290 } 00291 00292 return b; 00293 } 00294 00295 # ifdef _LTI_MSC_6 00296 virtual bool read(ioHandler& handler,const bool complete=true) { 00297 // ...we need this workaround to cope with another really awful MSVC 00298 // bug. 00299 return readMS(handler,complete); 00300 } 00301 # endif 00302 00303 00304 00305 //# ifdef _LTI_MSC_6 00306 /** 00307 * this function is required by MSVC only, as a workaround for a 00308 * very awful bug, which exists since MSVC V.4.0, and still by 00309 * V.6.0 with all bugfixes (so called "service packs") remains 00310 * there... This method is also public due to another bug, so please 00311 * NEVER EVER call this method directly: use read() instead 00312 */ 00313 // bool readMS(ioHandler& handler,const bool complete=true); 00314 00315 /** 00316 * this function is required by MSVC only, as a workaround for a 00317 * very awful bug, which exists since MSVC V.4.0, and still by 00318 * V.6.0 with all bugfixes (so called "service packs") remains 00319 * there... This method is also public due to another bug, so please 00320 * NEVER EVER call this method directly: use write() instead 00321 */ 00322 // bool writeMS(ioHandler& handler,const bool complete=true) const; 00323 //# endif 00324 00325 // ------------------------------------------------ 00326 // the parameters 00327 // ------------------------------------------------ 00328 00329 enum eDistanceType { 00330 L1Distance, 00331 L2Distance, 00332 L2SquareDistance 00333 }; 00334 00335 /** 00336 * Define eps region for a point, where neighbour points must be in 00337 */ 00338 double eps; 00339 00340 /** 00341 * Number of points which have to be in the eps-neighbourhood 00342 * otherwise the points are defined as noise 00343 */ 00344 int minPts; 00345 00346 /** 00347 * number of elements stored in each leaf of the kd-tree, 00348 * which is used als internal datastructure of DBScan 00349 */ 00350 int buckedSize; 00351 00352 /** 00353 * The kind of distance measurement that is used for computing the 00354 * distances between the data points. 00355 * Values are: l1Distance, l2Distance, l2SquareDistance 00356 */ 00357 eDistanceType distanceMeasure; 00358 00359 }; 00360 00361 //-------------------------------------------------------- 00362 // class DBSCan 00363 //-------------------------------------------------------- 00364 00365 /** 00366 * default constructor 00367 * reads and sets the parameters 00368 */ 00369 DBScan(); 00370 00371 /** 00372 * copy constructor 00373 * @param other the object to be copied 00374 */ 00375 DBScan(const DBScan& other); 00376 00377 /** 00378 * destructor 00379 */ 00380 virtual ~DBScan(); 00381 00382 /** 00383 * returns the name of this type ("DBScan") 00384 */ 00385 virtual const char* getTypeName() const; 00386 00387 /** 00388 * copy data of "other" clustering. 00389 * @param other the clustering to be copied 00390 * @return a reference to this clustering object 00391 */ 00392 DBScan& copy(const DBScan& other); 00393 00394 /** 00395 * alias for copy member 00396 * @param other the clustering to be copied 00397 * @return a reference to this clustering object 00398 */ 00399 DBScan& operator=(const DBScan& other); 00400 00401 /** 00402 * returns a pointer to a clone of this clustering. 00403 */ 00404 virtual classifier* clone() const; 00405 00406 /** 00407 * returns used parameters 00408 */ 00409 const parameters& getParameters() const; 00410 00411 /** 00412 * Unsupervised training. The vectors in the <code>input</code> 00413 * matrix will be put into groups according to the training 00414 * algorithm. Additionally, an integer indicating the class each 00415 * point belongs to is returned. <p> By default this method uses 00416 * the other train method train(const dmatrix&) and then calls 00417 * classify(const dvector&,outputVector&) to get the ids for each 00418 * train vector. These ids are then returned. 00419 * Train will return false, if no cluster is found and all points are 00420 * marked as noise! 00421 * @param input the matrix with the input vectors (each row is a training 00422 * vector) 00423 * @param ids vector of class ids for each input point 00424 * @return true if successful, false otherwise. (if false you can check 00425 * the error message with getStatusString()) 00426 */ 00427 virtual bool train(const dmatrix& input, 00428 ivector& ids); 00429 00430 /** 00431 * Unsupervised training. 00432 * The vectors in the <code>input</code> matrix 00433 * will be clustered using each specific method. 00434 * Train will return false, if no cluster is found. 00435 * @param input the matrix with the input vectors (each row is a training 00436 * vector) 00437 * @return true if successful, false otherwise. (if false you can check 00438 * the error message with getStatusString()) 00439 */ 00440 virtual bool train(const dmatrix& input); 00441 00442 //TODO Check whether you really need a new classify method. 00443 // In some cases the superclasses method will suffice. Then just 00444 // delete the declaration and its implementation stump. 00445 00446 /** classify 00447 * @param feature vector to be classified 00448 * @param result result as described above 00449 * @return true if successful, false otherwise 00450 */ 00451 virtual bool classify(const dvector& feature, outputVector& result) const; 00452 00453 00454 00455 #ifdef _LTI_DEBUG 00456 /** 00457 * showClustering for 2D points 00458 */ 00459 void showClustering(typename kdTree< vector<double>,int,T >::node* nPtr, 00460 kdTree< vector<double>,int,T >& Tree); 00461 00462 #endif 00463 00464 protected: 00465 00466 // typedefs 00467 typedef kdTree< vector<double>,int,T > dKdTree; 00468 00469 typedef typename dKdTree::node dNode; 00470 00471 typedef typename dKdTree::element dElement; 00472 00473 /** 00474 * DBScan algorithm working with the kdTree as data struckture 00475 * @param setOfPoints a kd-Tree containing the data to be clustered 00476 */ 00477 virtual bool dbScan(dKdTree& setOfPoints); 00478 00479 /** 00480 * The dbScan algorithm. 00481 * walks rekursively through the tree, if one point ist found, the 00482 * clustering is started 00483 */ 00484 void cluster(dNode* nPtr, 00485 dKdTree* setOfPoints); 00486 00487 /** 00488 * expandCluster tries to expand the cluster found, with respect of 00489 * eps, minPts, density-reachability and desity-connectivity 00490 * @param setOfPoints pointer to the used kd-Tree 00491 * @param point one point/element of the data stored in the kd-Tree 00492 * @param clusterID the ID of the current cluster 00493 * returns true if the cluster could be expanded 00494 */ 00495 bool expandCluster(dKdTree* setOfPoints, 00496 dElement* point, 00497 int clusterID); 00498 00499 /** 00500 * check if elements of node have been clustered, if the node is a leaf 00501 * returns true if all elements are clusterd, else false 00502 */ 00503 bool allElementsClustered(dNode* nPtr); 00504 00505 /** 00506 * eps neighbourhood 00507 */ 00508 double eps; 00509 00510 /** 00511 * minPts 00512 */ 00513 int minPts; 00514 00515 /** 00516 * clusterID 00517 */ 00518 int clusterID; 00519 00520 /** 00521 * kd-tree used for effective search on elements 00522 * internal used data strukture 00523 */ 00524 dKdTree setOfPoints; 00525 00526 /** 00527 * bucked-size of the kd-tree 00528 * number of elements stored in one leaf 00529 * default value: 2 00530 */ 00531 int buckedSize; 00532 00533 private: 00534 00535 /** 00536 * iterate over setOfPoints and add all points belonging to one 00537 * cluster to the same kd-Tree in kdClusterVec 00538 * 00539 */ 00540 void buildKdClusterVec(dNode* nPtr); 00541 00542 00543 /** 00544 * vector of kd-trees, each kd-tree represents one cluster 00545 * used for classification of points 00546 * the vector is initialized after the clustering 00547 */ 00548 std::vector<dKdTree*> kdClusterVec; 00549 00550 /** 00551 * vector of kdTree::elements one element of each cluster 00552 */ 00553 std::vector<dElement*> startElements; 00554 00555 /** 00556 * minimal distance to other clusters for each cluster 00557 * the vector index corresponds to the clusterID-1 00558 */ 00559 vector<double> minDistances; 00560 }; 00561 } 00562 00563 #include "ltiUndebug.h" 00564 00565 00566 #endif 00567 00568