2006-6-1 16:51
Arrayccq613
算法求助DBSCAN Matlab/C++
C++版本
using System;
using System.Collections;
namespace DBSCAN
{
/// <summary>
/// Cluster data using DBSCAN (Density-Based Spatical Clustering of Application with Noise) methed
/// See "Data Mining" for further information
/// </summary>
public sealed class DBSCAN
{
public ArrayList DataPoints = new ArrayList(128);
private ArrayList DP2DP;
private int m_Core_Num;
private int m_MinPts;
private double m_eps;
/// <summary>
/// Add DataPoint to DBSCAN module to cluster
/// </summary>
public void AddDataPoint(DataPoint dp)
{
DataPoints.Add(dp);
m_Core_Num = 0;
m_MinPts = 0;
m_eps = 0;
}
public void RemoveAllDataPoints()
{
DataPoints.Clear();
DP2DP.Clear();
m_Core_Num = 0;
m_MinPts = 0;
m_eps = 0;
}
public void ResetAllDataPointsState()
{
foreach(DataPoint dp in DataPoints)
{
dp.class_id = 0;
dp.core_tag = false;
dp.used_tag = false;
}
}
public void PrepareDBSCAN_Table()
{
int dp_count = DataPoints.Count;
DP2DP = new ArrayList(dp_count);
for(int i=0;i<dp_count;i++)
DP2DP.Add(new SortedList(dp_count)); // also include the point itself
SortedList sl;
DataPoint dp;
for(int i=0;i<dp_count;i++)
{
sl=(SortedList)DP2DP[i];
dp=(DataPoint)DataPoints[i];
for(int j=0;j<dp_count;j++)
{
double distance = dp.Distance((DataPoint)DataPoints[j]);
int n = 1;
while(sl.ContainsKey(distance) == true) // SortedList didn't support duplicate key
{
/* the statement:
// distance = distance - double.Epsilon;
didn't work for the POOL .NET double.GetHashCode() implement :-(
*/
distance = distance - 0.00000000000001*n;
n=n*2;
}
sl.Add(distance, DataPoints[j]);
}
}
}
public int BuildCorePoint(double eps, int MinPts)
{
ResetAllDataPointsState();
int core_num = 0;
SortedList sl;
DataPoint src_dp, des_dp;
for(int i=0;i<DataPoints.Count;i++)
{
sl=(SortedList)DP2DP[i];
des_dp=(DataPoint)sl.GetByIndex(MinPts);
src_dp=(DataPoint)DataPoints[i];
if(src_dp.Distance(des_dp)<eps)
{
src_dp.core_tag=true;
core_num++;
}
}
if(core_num>0)
{
m_Core_Num = core_num;
m_MinPts = MinPts;
m_eps = eps;
}
return core_num;
}
public void DBSCAN_Cluster()
{
DataPoint dp;
int current_class_id = 1;
for(int i=0;i<DataPoints.Count;i++)
{
dp=(DataPoint)DataPoints[i];
if(dp.used_tag==false && dp.core_tag==true)
{
dp.class_id = current_class_id;
dp.used_tag = true;
CorePointCluster(i, current_class_id);
current_class_id++;
}
}
}
private void CorePointCluster(int dp_pos, int core_class_id)
{
DataPoint src_dp, des_dp;
SortedList sl=(SortedList)DP2DP[dp_pos];
src_dp=(DataPoint)sl.GetByIndex(0);
int i=1;
des_dp=(DataPoint)sl.GetByIndex(i);
while(src_dp.Distance(des_dp)<m_eps)
{
if(des_dp.used_tag == false)
{
des_dp.class_id = core_class_id;
des_dp.used_tag = true;
if(des_dp.core_tag == true)
CorePointCluster(DataPoints.IndexOf(des_dp),core_class_id);
}
i++;
try
{
des_dp=(DataPoint)sl.GetByIndex(i);
}
catch( ArgumentOutOfRangeException )
{
// To avoid eps is too large that out of index
return;
}
}
}
}
/// <summary>
/// DBSCAN DataPoint
/// </summary>
public class DataPoint
{
public bool core_tag = false;
public int class_id = 0; // 0 indicate NOISE
public bool used_tag = false;
public double d1; // dimension x-axis
public double d2; // dimension y-axis
// dimension n (n>=3) can be extend by inherient this class
// and reimplement following two method.
public DataPoint(double x, double y)
{
d1=x;
d2=y;
}
public double Distance(DataPoint dp)
{
if(this != dp)
{
double d1sq=(d1-dp.d1)*(d1-dp.d1);
double d2sq=(d2-dp.d2)*(d2-dp.d2);
return Math.Sqrt( d1sq + d2sq );
}
else
return 0;
}
}
}
Matlab版本
function [Clusters]=realDBSCAN(x,y,Eps,MinPts)
%MinPts实际上是书p242页的MinPts+1,因为计一个对象的Eps临域内的点时,把该对象计算在内了。
%输出: Clusters,每一行代表一个簇,形式为簇的对象对应的原数据集的ID
%ObjSet: 代表数据集,存在第i个对象则相应的ObjSet(i)=1
DtNb=length(x); %DtNb为数据点的个数
Clusters=[]; %聚类结果初始化为“空”
ObjSet=linspace(1,1,DtNb); %得到一个长度为DtNb的数组,所有对象设置为1(它用来表示数据集中还没有被分入任何类的对象的集合)
for ObjID=1 : 1 : DtNb %遍历数据集(以每个对象为初始对象聚类,效率很低的方法)
if ObjSet(ObjID)==1 %如果数据集中有此对象则搜索其所在的簇,结果为簇的对象的ID的无序排列
OneCluster=[]; %OneCluster表示一次聚类的结果,初始化为“空”
OneCluster=SearchCluster(ObjID,ObjSet,x,y,Eps,MinPts); %一次搜索的结果
if not(isempty(OneCluster)) %如果簇不为空则添加入簇组,同时从数据集中删除相应对象(即:如果以一个对象为初始对象搜索得到的簇不为空,则将该簇添加到Clusters中,并且把ObjSet中的该簇中所有对象对应的数组元素置0,即删除。)
[Clusters,ObjSet]=PutInCluster(Clusters,OneCluster,ObjSet);
end
end
end
end
function [MySet]=SearchCluster(ObjID,ObjSet,x,y,Eps,MinPts)
MySet=[ObjID];
pt=1;
while pt<=length(MySet) %只要没有到达聚类的尾部就继续(即聚类停止增长前)
id=MySet(pt);
x0=x(id);
y0=y(id);
EpsRange=[];
for i=1:1:length(x) %在整个数据集中查找仍然在(没有被其他聚类吸收)的对象
if ObjSet(i)==1
if (sqrt((x(i)-x0)^2+(y(i)-y0)^2)<=Eps)
EpsRange=[EpsRange i];
end
end
end
if length(EpsRange)>=MinPts
MySet=MyAppend(MySet,EpsRange);
end
pt=pt+1;
end
if length(MySet)<MinPts %如果簇的元素少于MinPts即是没有产生簇,所以返回“空”
MySet=[];
end
end
function [AplusB]=MyAppend(A,B)
%Append elements of B to the end of A,which was
AplusB=A;
t=0;
for i=1:length(B)
t=B(i);
flag=0;
for j=1:length(A)
if t==A(j)
flag=1; %when find the same: flag=1
break
end
end
if flag==0 %if not in A then append it to the end of A
AplusB=[AplusB t];
end
end
end
function [C,O]=PutInCluster(A,B,O)
%将新的簇B 放入簇组A中,更新数据集O
%OneCluster形式为[3 7 1 ...]对应的 Clusters中的相应行形式为[1 0 1 0 0 0 1 ...]
if length(A)==0 %簇组为空时添加入第一行
i=1;
else
i=length(A(:,1))+1; %簇组不为空时,添加入新行i
end
A(i, : )=linspace(0,0,length(O));
for j=1 : 1 :length(B)
t=B(j);
A(i,t)=1;
end
C=A;
O=~sum(C,1); %取反即可得到剩余的对象
end
function [NewObjSet]=ReSetObjSet(ObjSet,Clusters)
Lobj=length(ObjSet);
Lclu=length(Clusters);
tmp1=sum(Clusters,2);
if Lclu<Lobj
tmp2=linspace(0,0,Lobj-Lclu);
tmp1=[tmp1 tmp2];
end
NewObjSet=xor(ObjSet,tmp1);
end
希望各位大哥给予调试,成功有酬谢!QQ68135805