LTI-Lib latest version v1.9 - last update 10 Apr 2010

ltiDBScan.h

00001 /*
00002  * Copyright (C) 2003, 2004, 2005, 2006
00003  * Lehrstuhl fuer Technische Informatik, RWTH-Aachen, Germany
00004  *
00005  * This file is part of the LTI-Computer Vision Library (LTI-Lib)
00006  *
00007  * The LTI-Lib is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public License (LGPL)
00009  * as published by the Free Software Foundation; either version 2.1 of
00010  * the License, or (at your option) any later version.
00011  *
00012  * The LTI-Lib is distributed in the hope that it will be
00013  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
00014  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with the LTI-Lib; see the file LICENSE.  If
00019  * not, write to the Free Software Foundation, Inc., 59 Temple Place -
00020  * Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 
00023 
00024 /*--------------------------------------------------------------------
00025  * project ....: LTI-Lib: Image Processing and Computer Vision Library
00026  * file .......: ltiDBScan.h
00027  * authors ....: Bastian Ibach
00028  * organization: LTI, RWTH Aachen
00029  * creation ...: 28.4.2003
00030  * revisions ..: $Id: ltiDBScan.h,v 1.10 2006/02/07 18:16:37 ltilib Exp $
00031  */
00032 
00033 #ifndef _LTI_D_B_SCAN_H_
00034 #define _LTI_D_B_SCAN_H_
00035 
00036 //TODO: include only those files which are needed in this header!!
00037 
00038 // TODO: Check this include to parent class: 
00039 #include "ltiClustering.h"
00040 #include "ltiKdTree.h"
00041 #include "ltiL2Distance.h"
00042 
00043 // std / stl includes
00044 #include <list>
00045 
00046  #undef _LTI_DEBUG
00047 //#define _LTI_DEBUG 3 // comment out this line to remove all debug info.
00048  #include "ltiDebug.h"
00049 
00050 namespace lti {
00051 
00052   /**
00053    * A class providing the DBScan for clustering
00054    *
00055    * DBScan is a density-based clustering algorithm, which is designed to find
00056    * clusters of abitrary shape. 
00057    *
00058    * Using one point the algorithm searches for minPts in the eps-neighborhood
00059    * of this point. If not enough points are found, they are classified as
00060    * "NOISE" (Value = 0) else it tries to expand this found cluster by using
00061    * each found point for another search and assigns the current cluster ID to
00062    * each point.  If then not enough points are found an new cluster is
00063    * created using an not jet classified point.  The clustering finishes if
00064    * all points are clussified, that means either having a cluster ID or
00065    * marked as "NOISE".
00066    *
00067    * The original algorithm uses a R*-Tree as internal data struckture, this
00068    * implementation uses the lti::kdTree.  The trees provide an efficient
00069    * method for searching points in an eps-neighborhood.
00070    *
00071    * DBScan requires three parameters:
00072    *
00073    * - \e buckedSize the number of elements stored in each leaf of the
00074    *   kd-Tree. The default value is 2.
00075    * - \e minPts the minimum number of points which have to be found in the
00076    *   eps neighborhood. The default value is 4.
00077    * - \e eps <b>IMPORTANT:</b> The size of the eps-neighborhood used for
00078    *   searching.  This parameter <b>MUST</b> be set by the user. The default
00079    *   value is 0 and if it isn't change, the algorithms returns false.
00080    *
00081    * The eps is the critical parameter of this algorithm, because the number
00082    * of clusters depends on it.  A good value would be the density of the
00083    * least dense cluster. But it is very hard to get this information on
00084    * advance.  Normally one does not know the distribution of the points in
00085    * the space. If no cluster is found, all points are marked as noise!
00086    *
00087    * The type T describes the distance measure used in the internal data 
00088    * structure. The Default value is l2SquareDistance
00089    * 
00090    * The original DBScan algorithm was introduced in:
00091    *
00092    * Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu A
00093    * Density-Based Algorithm for Discovering Clusters in Large Spatial
00094    * Databases with Noise. Proceedings of 2nd International Conference on
00095    * Knowledge Discovery and Data Mining (KDD-96)
00096    *
00097    * URL: http://www.dbs.informatik.uni-muenchen.de/Publikationen/Papers/
00098    * KDD-96.final.frame.ps
00099    */
00100 
00101   template <class T=l2SquareDistantor<vector<double> > >
00102   class DBScan : public clustering {
00103   public: 
00104     
00105     /**
00106      * the parameters for the class DBScan
00107      */
00108     class parameters : public clustering::parameters {
00109     public:
00110       /**
00111        * default constructor
00112        */
00113       parameters() : clustering::parameters(){
00114         eps = double(1.0);
00115         minPts = int(6);
00116         buckedSize = int(4);
00117         distanceMeasure = L2SquareDistance;
00118       }
00119 
00120       /**
00121        * copy constructor
00122        * @param other the parameters object to be copied
00123        */
00124       parameters(const parameters& other)
00125         : clustering::parameters() {
00126         copy(other);
00127       }
00128 
00129       /**
00130        * destructor
00131        */
00132       virtual ~parameters(){}
00133 
00134       /**
00135        * returns name of this type
00136        */
00137       const char* getTypeName() const{
00138     return "DBScan:parameters";
00139       }
00140 
00141       /**
00142        * copy the contents of a parameters object
00143        * @param other the parameters object to be copied
00144        * @return a reference to this parameters object
00145        */
00146       parameters& copy(const parameters& other){
00147 #     ifndef _LTI_MSC_6
00148         // MS Visual C++ 6 is not able to compile this...
00149         clustering::parameters::copy(other);
00150 #     else
00151         // ...so we have to use this workaround.
00152         // Conditional on that, copy may not be virtual.
00153         clustering::parameters& (clustering::parameters::* p_copy)
00154           (const clustering::parameters&) =
00155           clustering::parameters::copy;
00156         (this->*p_copy)(other);
00157 #    endif
00158         eps = other.eps;
00159         minPts = other.minPts;
00160   buckedSize = other.buckedSize;
00161         distanceMeasure = other.distanceMeasure;
00162         return *this;
00163       };
00164 
00165 
00166       /**
00167        * copy the contents of a parameters object
00168        * @param other the parameters object to be copied
00169        * @return a reference to this parameters object
00170        */
00171       parameters& operator=(const parameters& other){
00172     return copy(other);
00173       };
00174 
00175 
00176       /**
00177        * returns a pointer to a clone of the parameters
00178        */
00179       virtual classifier::parameters* clone() const{
00180     return new parameters(*this);
00181       };
00182 
00183       /**
00184        * write the parameters in the given ioHandler
00185        * @param handler the ioHandler to be used
00186        * @param complete if true (the default) the enclosing begin/end will
00187        *        be also written, otherwise only the data block will be written.
00188        * @return true if write was successful
00189        */
00190 # ifndef _LTI_MSC_6
00191       bool write(ioHandler& handler,const bool complete=true) const
00192 # else
00193       bool writeMS(ioHandler& handler,const bool complete=true) const
00194 # endif
00195         {
00196           bool b = true;
00197           if (complete) {
00198             b = handler.writeBegin();
00199           }
00200           
00201           if (b) {
00202             lti::write(handler,"eps",eps);
00203             lti::write(handler,"minPts",minPts);
00204             lti::write(handler,"buckedSize",buckedSize);
00205             if (distanceMeasure==L1Distance) {
00206               lti::write(handler,"distanceMeasure","L1Distance");
00207             } else if (distanceMeasure==L2Distance) {
00208               lti::write(handler,"distanceMeasure","L2Distance");
00209             } else {
00210               lti::write(handler,"distanceMeasure","L2SquareDistance");
00211             }
00212           }
00213           
00214 #     ifndef _LTI_MSC_6
00215           // This is the standard C++ code, which MS Visual C++ 6 is not able to
00216           // compile...
00217           b = b && clustering::parameters::write(handler,false);
00218 #     else
00219           bool (clustering::parameters::* p_writeMS)(ioHandler&,
00220                                                     const bool) const =
00221       clustering::parameters::writeMS;
00222           b = b && (this->*p_writeMS)(handler,false);
00223 #     endif
00224           
00225           if (complete) {
00226             b = b && handler.writeEnd();
00227           }
00228           
00229           return b;
00230         }
00231       
00232 #     ifdef _LTI_MSC_6
00233       virtual bool write(ioHandler& handler,
00234                          const bool complete = true) const {
00235         // ...we need this workaround to cope with another really
00236         // awful MSVC bug.
00237         return writeMS(handler,complete);
00238       }
00239 #     endif
00240 
00241 
00242 
00243       /**
00244        * read the parameters from the given ioHandler
00245        * @param handler the ioHandler to be used
00246        * @param complete if true (the default) the enclosing begin/end will
00247        *        be also written, otherwise only the data block will be written.
00248        * @return true if write was successful
00249        */
00250 #     ifndef _LTI_MSC_6
00251       virtual bool read(ioHandler& handler,const bool complete = true)
00252 #     else
00253       bool readMS(ioHandler& handler,const bool complete=true)
00254 #     endif
00255       {
00256         bool b = true;
00257         if (complete) {
00258           b = handler.readBegin();
00259         }
00260 
00261         if (b) {
00262           lti::read(handler,"eps",eps);
00263           lti::read(handler,"minPts",minPts);
00264     lti::read(handler,"buckedSize",buckedSize);
00265           std::string tmpStr;
00266           lti::read(handler,"distanceMeasure",tmpStr);
00267           if (tmpStr=="L1Distance") {
00268             distanceMeasure=L1Distance;
00269           } else if (tmpStr=="L2Distance") {
00270             distanceMeasure=L2Distance;
00271           } else {
00272             distanceMeasure=L2SquareDistance;
00273           }
00274 
00275         }
00276 
00277 #     ifndef _LTI_MSC_6
00278         // This is the standard C++ code, which MS Visual C++ 6 is not able to
00279         // compile...
00280         b = b && clustering::parameters::read(handler,false);
00281 #     else
00282         bool (clustering::parameters::* p_readMS)(ioHandler&,
00283                                                         const bool) =
00284           clustering::parameters::readMS;
00285         b = b && (this->*p_readMS)(handler,false);
00286 #     endif
00287 
00288         if (complete) {
00289           b = b && handler.readEnd();
00290         }
00291 
00292         return b;
00293       }
00294 
00295 #     ifdef _LTI_MSC_6
00296       virtual bool read(ioHandler& handler,const bool complete=true) {
00297         // ...we need this workaround to cope with another really awful MSVC
00298         // bug.
00299         return readMS(handler,complete);
00300       }
00301 #      endif
00302 
00303 
00304 
00305 //#     ifdef _LTI_MSC_6
00306       /**
00307        * this function is required by MSVC only, as a workaround for a
00308        * very awful bug, which exists since MSVC V.4.0, and still by
00309        * V.6.0 with all bugfixes (so called "service packs") remains
00310        * there...  This method is also public due to another bug, so please
00311        * NEVER EVER call this method directly: use read() instead
00312        */
00313 //      bool readMS(ioHandler& handler,const bool complete=true);
00314 
00315       /**
00316        * this function is required by MSVC only, as a workaround for a
00317        * very awful bug, which exists since MSVC V.4.0, and still by
00318        * V.6.0 with all bugfixes (so called "service packs") remains
00319        * there...  This method is also public due to another bug, so please
00320        * NEVER EVER call this method directly: use write() instead
00321        */
00322 //      bool writeMS(ioHandler& handler,const bool complete=true) const;
00323 //#     endif
00324 
00325       // ------------------------------------------------
00326       // the parameters
00327       // ------------------------------------------------
00328 
00329       enum eDistanceType {
00330         L1Distance,
00331         L2Distance,
00332         L2SquareDistance
00333       };
00334       
00335       /**
00336        * Define eps region for a point, where neighbour points must be in
00337        */
00338       double eps;
00339       
00340       /**
00341        * Number of points which have to be in the eps-neighbourhood
00342        * otherwise the points are defined as noise
00343        */
00344       int minPts;
00345       
00346       /**
00347        * number of elements stored in each leaf of the kd-tree,
00348        * which is used als internal datastructure of DBScan
00349        */
00350       int buckedSize;
00351 
00352       /**
00353        * The kind of distance measurement that is used for computing the
00354        * distances between the data points.
00355        * Values are: l1Distance, l2Distance, l2SquareDistance
00356        */
00357       eDistanceType distanceMeasure;
00358 
00359     };
00360       
00361     //--------------------------------------------------------
00362     //       class DBSCan
00363     //--------------------------------------------------------
00364 
00365     /**
00366      * default constructor
00367      * reads and sets the parameters
00368      */
00369     DBScan();
00370 
00371     /**
00372      * copy constructor
00373      * @param other the object to be copied
00374      */
00375     DBScan(const DBScan& other);
00376 
00377     /**
00378      * destructor
00379      */
00380     virtual ~DBScan();
00381 
00382     /**
00383      * returns the name of this type ("DBScan")
00384      */
00385     virtual const char* getTypeName() const;
00386 
00387     /**
00388      * copy data of "other" clustering.
00389      * @param other the clustering to be copied
00390      * @return a reference to this clustering object
00391      */
00392     DBScan& copy(const DBScan& other);
00393 
00394     /**
00395      * alias for copy member
00396      * @param other the clustering to be copied
00397      * @return a reference to this clustering object
00398      */
00399     DBScan& operator=(const DBScan& other);
00400 
00401     /**
00402      * returns a pointer to a clone of this clustering.
00403      */
00404     virtual classifier* clone() const;
00405 
00406     /**
00407      * returns used parameters
00408      */
00409     const parameters& getParameters() const;
00410 
00411     /**
00412      * Unsupervised training.  The vectors in the <code>input</code>
00413      * matrix will be put into groups according to the training
00414      * algorithm.  Additionally, an integer indicating the class each
00415      * point belongs to is returned. <p> By default this method uses
00416      * the other train method train(const dmatrix&) and then calls
00417      * classify(const dvector&,outputVector&) to get the ids for each
00418      * train vector. These ids are then returned.
00419      * Train will return false, if no cluster is found and all points are 
00420      * marked as noise!
00421      * @param input the matrix with the input vectors (each row is a training
00422      *              vector)
00423      * @param ids vector of class ids for each input point
00424      * @return true if successful, false otherwise. (if false you can check
00425      *              the error message with getStatusString())
00426      */
00427     virtual bool train(const dmatrix& input,
00428                        ivector& ids);
00429 
00430     /**
00431      * Unsupervised training.
00432      * The vectors in the <code>input</code> matrix
00433      * will be clustered using each specific method.
00434      * Train will return false, if no cluster is found.
00435      * @param input the matrix with the input vectors (each row is a training
00436      *              vector)
00437      * @return true if successful, false otherwise. (if false you can check
00438      *              the error message with getStatusString())
00439      */
00440     virtual bool train(const dmatrix& input);
00441 
00442     //TODO Check whether you really need a new classify method.
00443     // In some cases the superclasses method will suffice. Then just
00444     // delete the declaration and its implementation stump.
00445   
00446     /** classify
00447      * @param feature vector to be classified
00448      * @param result result as described above
00449      * @return true if successful, false otherwise
00450      */
00451     virtual bool classify(const dvector& feature, outputVector& result) const;
00452     
00453 
00454 
00455 #ifdef _LTI_DEBUG
00456     /**
00457      * showClustering for 2D points
00458      */
00459     void showClustering(typename kdTree< vector<double>,int,T >::node* nPtr,
00460                         kdTree< vector<double>,int,T >& Tree);
00461 
00462 #endif
00463 
00464   protected:
00465 
00466     // typedefs
00467     typedef kdTree< vector<double>,int,T > dKdTree;
00468 
00469     typedef typename dKdTree::node dNode;
00470 
00471     typedef typename dKdTree::element dElement;
00472     
00473     /**
00474      * DBScan algorithm working with the kdTree as data struckture
00475      * @param setOfPoints a kd-Tree containing the data to be clustered
00476      */
00477     virtual bool dbScan(dKdTree& setOfPoints);
00478 
00479     /**
00480      * The dbScan algorithm. 
00481      * walks rekursively through the tree, if one point ist found, the
00482      * clustering is started
00483      */
00484     void cluster(dNode* nPtr, 
00485                  dKdTree* setOfPoints);
00486     
00487     /**
00488      * expandCluster tries to expand the cluster found, with respect of
00489      * eps, minPts, density-reachability and desity-connectivity
00490      * @param setOfPoints pointer to the used kd-Tree
00491      * @param point one point/element of the data stored in the kd-Tree
00492      * @param clusterID the ID of the current cluster
00493      * returns true if the cluster could be expanded
00494      */
00495     bool expandCluster(dKdTree* setOfPoints, 
00496            dElement* point,
00497            int clusterID);
00498     
00499     /**
00500      * check if elements of node have been clustered, if the node is a leaf
00501      * returns true if all elements are clusterd, else false
00502      */
00503     bool allElementsClustered(dNode* nPtr);
00504     
00505     /**
00506      * eps neighbourhood
00507      */
00508     double eps;
00509     
00510     /**
00511      * minPts
00512      */
00513     int minPts;
00514     
00515     /**
00516      * clusterID
00517      */
00518     int clusterID;
00519 
00520     /**
00521      * kd-tree used for effective search on elements
00522      * internal used data strukture
00523      */
00524     dKdTree setOfPoints;
00525 
00526     /**
00527      * bucked-size of the kd-tree
00528      * number of elements stored in one leaf
00529      * default value: 2
00530      */
00531     int buckedSize;
00532 
00533   private:
00534 
00535     /**
00536      * iterate over setOfPoints and add all points belonging to one
00537      * cluster to the same kd-Tree in kdClusterVec
00538      *
00539      */
00540     void buildKdClusterVec(dNode* nPtr);
00541     
00542 
00543     /**
00544      * vector of kd-trees, each kd-tree represents one cluster
00545      * used for classification of points
00546      * the vector is initialized after the clustering
00547      */
00548     std::vector<dKdTree*> kdClusterVec;
00549 
00550     /**
00551      * vector of kdTree::elements one element of each cluster
00552      */
00553     std::vector<dElement*> startElements;
00554 
00555     /**
00556      * minimal distance to other clusters for each cluster
00557      * the vector index corresponds to the clusterID-1
00558      */
00559     vector<double> minDistances;
00560   };
00561 }
00562    
00563 #include "ltiUndebug.h"
00564 
00565 
00566 #endif
00567 
00568     

Generated on Sat Apr 10 15:25:21 2010 for LTI-Lib by Doxygen 1.6.1