另类高效模糊检索之按时间片划区段分区检索

来源:互联网 发布:电信apn设置4g最快网络 编辑:程序博客网 时间:2024/06/05 20:44

     最近在做一个项目时遇到从数据集合中根据记录ID检索单个数据的问题,本来是采用for循环顺序查找的方式,但是发现如果数据集合比较大时效率就成了问题,本来也想过采用二分查找方式,后来经过仔细研究并结合数据特点(数据特点:id采用的是YYYYMMDD+10位随机数,addtime为数据添加实时时间,因此id与addtime中的YYYYMMDD肯定相等,不相等的几率几乎不存在),决定采用自主设计的按时间片划区段分区搜索的算法。主要思想就是:

 

    1.数据集按addtime降序排列。

    2.搜索时间范围是根据id得到YYYY-MM-DD,在这一天的0-24小时内添加的数据集内检索。

    3.设定单次搜索块的最大数,即block。   

 

    采用此思想搜索的效率比较:1<循环次数<=日添加数据量+数据集总数/block - 日添加数据量/block,如某数据集总数为1000,数据集尾部最后一天添加的数据量为20,单次搜索块为10,如此时检索的是最后一条数据,则循环次数为1000/10 - 20/10 +20=28次,而采用for循环的顺序查找循环次数则是1000;此处无法采用二分查找法,因为排序字段不是id,如采用二分查找法则需要根据字段id重新排序后查找,而排序耗费的资源也是很大的,在此处也不是理想的算法。此处可以根据日数据量设定block来提高检索效率,对于中小量的数据来说此算法应该足以应付,大量及海量数据有待进一步研究。

          

            //获取数据集

            public static ObjectTable<Documentcontent> GetCollection(string _DocumentID)

            

            /// <summary>
            /// 获取内容
            /// </summary>
            /// <param name="_DocumentID">文档ID</param>
            /// <returns></returns>
            public static ObjectTable<Documentcontent> GetCollection(string _DocumentID)
            {
                string workSpaceID = Common.GetEditWorkSpaceID();

                //提供跨工作区调用数据源支持
                //workSpaceID = SiteHelper.GetObject.GetItem(workSpaceID).Datasource.Value;

                if (Convert.ToBoolean(ConfigurationManager.AppSettings[Enum_CacheItem.内容_Content]))
                {
                    if (!TestHashTableKey("DocumentContent" + workSpaceID + _DocumentID))
                    {
                        Cache(_DocumentID);
                    }

                    return (ObjectTable<Documentcontent>)hashTableCache["DocumentContent" + workSpaceID + _DocumentID];
                }
                else
                {
                    try
                    {
                        IDocumentcontentDAO dao = DAOManager.Default.GetDAO<IDocumentcontentDAO>();

                        QueryFilter filter = new QueryFilter();

                        if (!isShowDisabled)
                        {
                            filter.Items.Add(new DBField("status", DataType.String, DBOperator.Equal, "0"));
                        }

                        filter.Items.Add(new DBField("documentid", DataType.String, DBOperator.Equal, _DocumentID));

                        using (IDBSession session = DBSessionManager.Default.GetSession(Common.GetEditWorkSpaceDBConnName()))
                        {
                            return dao.SelectList(session, "*", filter, "addtime desc");
                        }
                    }
                    catch (Exception ex)
                    {
                        throw ex;
                    }
                }
            }

 

            //根据记录id在数据集中查找数据

            public static Documentcontent GetItem(string _DocumentID, string _DocumentContentID)

          

            /// <summary>
            /// 根据内容ID获取一个内容
            /// </summary>
            /// <param name="_DocumentContentID">内容ID</param>
            /// <param name="_DocumentID">文档ID</param>
            /// <returns></returns>
            public static Documentcontent GetItem(string _DocumentID, string _DocumentContentID)
            {
                Documentcontent item = null;

                ObjectTable<Documentcontent> lists = GetCollection(_DocumentID);

                //如果记录总数大于10则启用模糊查找
                if (lists.Count >= 10)
                {
                    #region 模糊搜索

                    DateTime _datetime = Convert.ToDateTime(_DocumentContentID.Substring(0, 8).Insert(4, "/").Insert(7, "/"));

                    int sum = lists.Count;

                    //分块大小
                    int block = 10;

                    //分块数
                    int pos = sum / block;

                    //不完整分块
                    int res = sum - pos * block;

                    //搜索分块次数
                    int scount = 0;

                    TimeSpan s_timespan = _datetime.AddDays(1) - lists[0].Addtime.Value;

                    TimeSpan e_timespan = _datetime.AddDays(1) - lists[sum - 1].Addtime.Value;

                    if ((s_timespan.Days > 0) || (e_timespan.Days < 0))//超出列表时间界限
                    {
                        item = null;
                    }
                    else//未超出列表时间界限
                    {
                        for (int i = 0; i < pos; i++)
                        {
                            //scount=0:第一次进入分块查询
                            //scount=1:第二次以上进入查询且当前块的尾部时间<=_datetime
                            //scount=2:退出查询
                            if (scount == 0 || scount == 1)
                            {
                                //lists[(i + 1) * block - 1].Addtime.Value <= _datetime.AddDays(1):当前分块尾部时间大于等于_datetime.AddDays(1)  
                                if ((lists[(i + 1) * block - 1].Addtime.Value <= _datetime.AddDays(1)))
                                {
                                    for (int j = 0; j < block; j++)
                                    {
                                        if (lists[i * block + j].ID.Value == _DocumentContentID)
                                        {
                                            item = lists[i * block + j];

                                            break;
                                        }
                                    }

                                    //在当前分块中未查找到,进入下一分块继续查找,因为当前分块时间点可能在0点到23点59分之间,而_datetime为0点时间,估紧邻的两个分块都有可能是目标搜索块
                                    if (item == null)
                                    {
                                        if (lists[(i + 1) * block - 1].Addtime.Value > _datetime.AddDays(1) && scount == 1)
                                        {
                                            //标示当前块尾部时间已经大于_datetime.AddDays(1),不属于当天的记录,不必再进入下一分块查询
                                            scount = 2;
                                        }
                                        else
                                        {
                                            //进入下一分块查询
                                            scount = 1;
                                        }

                                        continue;
                                    }
                                    else//在当前分块中已经找到,直接退出查找
                                    {
                                        break;
                                    }
                                }
                                else//前一分块时间小于_datetime.AddDays(1)
                                {
                                    continue;
                                }
                            }
                            else
                            {
                                //scount=2:在当前分块中未查找到,且当前分开尾部时间已经小于_datetime.AddDays(1),直接退出查找
                                break;
                            }
                        }

                        //未查到、不完整分块不为零且当前分块尾部时间已经>=_datetime.AddDays(1)
                        if (item == null && res != 0 && scount != 2)
                        {
                            for (int i = pos * block; i < sum; i++)
                            {
                                if (lists[i].ID.Value == _DocumentContentID)
                                {
                                    item = lists[i];

                                    break;
                                }
                            }
                        }
                    }

                    #endregion
                }
                else
                {
                    #region 顺序查找

                    for (int i = 0; i < lists.Count; i++)
                    {
                        if (lists[i].ID.Value == _DocumentContentID)
                        {
                            item = lists[i];

                            break;
                        }
                    }

                    #endregion
                }

                return item;
            }

原创粉丝点击