X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=src%2Flib%2Fgscphftaaux%2Fhfta_udaf.cc;fp=src%2Flib%2Fgscphftaaux%2Fhfta_udaf.cc;h=60ae7501eae769e9e08fe9d28d94700262ea92db;hb=87cd81502f23b63c980bbce93b6159379299c782;hp=7eef46de1935991e46fbecb6ea35925a7735b8cf;hpb=989d19428b3d21982b048cf256f625a8ca664c2e;p=com%2Fgs-lite.git diff --git a/src/lib/gscphftaaux/hfta_udaf.cc b/src/lib/gscphftaaux/hfta_udaf.cc index 7eef46d..60ae750 100644 --- a/src/lib/gscphftaaux/hfta_udaf.cc +++ b/src/lib/gscphftaaux/hfta_udaf.cc @@ -27,6 +27,7 @@ Copyright 2014 AT&T Intellectual Property #include #include "hfta_runtime_library.h" +#include"stringhash.h" #define max(a,b) ((a) > (b) ? (a) : (b)) @@ -924,4 +925,78 @@ gs_int64_t extr_running_sum_max(vstring *v){ return vs->max; } +// --------------------------------------------- +// Approximate count distinct. +// Rely on the minhashing approach. +// Currently HFTA-only +// Uses a 32-bit hash, tested up to 100,000,000 elements +// and it gave good results (within 7%) + + +#define COUNT_DISTINCT_NREPS 250 +#define COUNT_DISTINCT_MAX_STRING_LEN 200 // number of 4-byte words + +static Hash32bit2univID hids[COUNT_DISTINCT_NREPS]; +static int approx_count_distinct_udaf_initialized = 0; +struct approx_count_distinct_udaf_str{ + unsigned int mn[COUNT_DISTINCT_NREPS]; +}; + + +void approx_count_distinct_udaf_HFTA_AGGR_INIT_(gs_sp_t buf){ + approx_count_distinct_udaf_str *cd = (approx_count_distinct_udaf_str *)buf; + for(int i=0;imn[i]=4294967295; + if(approx_count_distinct_udaf_initialized==0){ + for(int i=0;ilength/4] = 0; + memcpy((char *)buffer, (char *)val->offset, min(val->length, 800)); + unsigned int len4 = val->length/4 + ((val->length&0x03)>0); + + for(int i=0; imn[i]) cd->mn[i] = h; + } +} +void running_approx_count_distinct_udaf_HFTA_AGGR_UPDATE_(gs_sp_t buf, vstring *val){ + approx_count_distinct_udaf_HFTA_AGGR_UPDATE_(buf, val); +} + +void approx_count_distinct_udaf_HFTA_AGGR_OUTPUT_(vstring *res, gs_sp_t buf){ + res->offset = (gs_p_t)buf; + res->length = sizeof(approx_count_distinct_udaf_str); + res->reserved = SHALLOW_COPY; +} +void running_approx_count_distinct_udaf_HFTA_AGGR_OUTPUT_(vstring *res, gs_sp_t buf){ + approx_count_distinct_udaf_HFTA_AGGR_OUTPUT_(res, buf); +} + +gs_float_t extr_approx_count_distinct(vstring *v){ + approx_count_distinct_udaf_str *cd = (approx_count_distinct_udaf_str *)(v->offset); + gs_float_t avg = 0.0; + for(int i=0;imn[i]; + } + avg /= COUNT_DISTINCT_NREPS; + gs_float_t est = (4294967295.0 / avg) - 1; + return est; +} +