1010#include "postgres.h"
1111
1212#include "catalog/namespace.h"
13+ #include "catalog/pg_statistic.h"
1314#include "catalog/pg_type.h"
1415#include "utils/builtins.h"
1516#include "utils/lsyscache.h"
17+ #include "utils/memutils.h"
18+ #include "utils/syscache.h"
1619#include "utils/varlena.h"
1720
1821#include "rum.h"
1922
20- char * TFIDFSource ;
23+ /* lookup table type for binary searching through MCELEMs */
24+ typedef struct
25+ {
26+ text * element ;
27+ float4 frequency ;
28+ } TextFreq ;
29+
/*
 * Search key handed to bsearch() when probing an array of TextFreqs.
 * The lexeme is described by pointer plus explicit length; it is not
 * required to be NUL-terminated.
 */
typedef struct
{
	char	   *lexeme;			/* first byte of the lexeme */
	int			length;			/* number of bytes in the lexeme */
} LexemeKey;
36+
37+ typedef struct
38+ {
39+ TextFreq * lookup ;
40+ int nmcelem ;
41+ float4 minfreq ;
42+ } MCelemStats ;
43+
44+ typedef struct
45+ {
46+ Oid relId ;
47+ AttrNumber attrno ;
48+ } RelAttrInfo ;
49+
50+ char * TFIDFSource ;
51+ static RelAttrInfo TFIDFSourceParsed ;
52+ static bool TDIDFLoaded = false;
53+ static MemoryContext TFIDFContext = NULL ;
54+ static MCelemStats TDIDFStats ;
2155
2256#define EXIT_CHECK_TF_IDF_SOURCE (error ) \
2357 do { \
@@ -29,18 +63,24 @@ char *TFIDFSource;
2963 return false; \
3064 } while (false);
3165
/* Local helpers: statistics cache management */
static void load_tf_idf_source(void);
static void check_load_tf_idf_source(void);
static void forget_tf_idf_stats(void);

/* Local helpers: bsearch() comparator (LexemeKey vs. TextFreq) */
static int	compare_lexeme_textfreq(const void *e1, const void *e2);
70+
3271bool
3372check_tf_idf_source (char * * newval , void * * extra , GucSource source )
3473{
35- char * rawname ;
36- char * attname ;
37- List * namelist ;
38- Oid namespaceId ;
39- Oid relId ;
40- Relation rel = NULL ;
41- TupleDesc tupDesc ;
42- AttrNumber attrno ;
43- int i ;
74+ char * rawname ;
75+ char * attname ;
76+ List * namelist ;
77+ Oid namespaceId ;
78+ Oid relId ;
79+ Relation rel = NULL ;
80+ TupleDesc tupDesc ;
81+ AttrNumber attrno ;
82+ int i ;
83+ RelAttrInfo * myextra ;
4484
4585 /* Need a modifiable copy of string */
4686 rawname = pstrdup (* newval );
@@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
107147 if (tupDesc -> attrs [attrno - 1 ]-> atttypid != TSVECTOROID )
108148 EXIT_CHECK_TF_IDF_SOURCE ("attribute should be of tsvector type" );
109149
150+ myextra = (RelAttrInfo * ) malloc (sizeof (RelAttrInfo ));
151+ myextra -> relId = relId ;
152+ myextra -> attrno = attrno ;
153+ * extra = (void * ) myextra ;
154+
110155 pfree (rawname );
111156 list_free (namelist );
112157 RelationClose (rel );
@@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
117162void
118163assign_tf_idf_source (const char * newval , void * extra )
119164{
165+ RelAttrInfo * myextra = (RelAttrInfo * ) extra ;
166+
167+ TFIDFSourceParsed = * myextra ;
168+ forget_tf_idf_stats ();
169+ }
170+
171+ static void
172+ load_tf_idf_source (void )
173+ {
174+ HeapTuple statsTuple ;
175+ AttStatsSlot sslot ;
176+ MemoryContext oldContext ;
177+ int i ;
178+
179+ if (!TFIDFContext )
180+ TFIDFContext = AllocSetContextCreate (TopMemoryContext ,
181+ "Memory context for TF/IDF statistics" ,
182+ ALLOCSET_DEFAULT_SIZES );
183+
184+ statsTuple = SearchSysCache3 (STATRELATTINH ,
185+ ObjectIdGetDatum (TFIDFSourceParsed .relId ),
186+ Int16GetDatum (TFIDFSourceParsed .attrno ),
187+ BoolGetDatum (true));
188+
189+ if (!statsTuple )
190+ statsTuple = SearchSysCache3 (STATRELATTINH ,
191+ ObjectIdGetDatum (TFIDFSourceParsed .relId ),
192+ Int16GetDatum (TFIDFSourceParsed .attrno ),
193+ BoolGetDatum (false));
194+
195+ MemoryContextReset (TFIDFContext );
196+ TDIDFLoaded = false;
197+
198+ oldContext = MemoryContextSwitchTo (TFIDFContext );
199+
200+ if (!statsTuple
201+ || !get_attstatsslot (& sslot , statsTuple ,
202+ STATISTIC_KIND_MCELEM , InvalidOid ,
203+ ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS )
204+ || sslot .nnumbers != sslot .nvalues + 2 )
205+ {
206+ ereport (ERROR ,
207+ (errcode (ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ),
208+ errmsg ("statistics for TD/IDF is not found" ),
209+ errhint ("consider running ANALYZE" )));
210+ }
211+
212+ TDIDFStats .nmcelem = sslot .nvalues ;
213+ TDIDFStats .minfreq = sslot .numbers [sslot .nnumbers - 2 ];
214+ /*
215+ * Transpose the data into a single array so we can use bsearch().
216+ */
217+ TDIDFStats .lookup = (TextFreq * ) palloc (sizeof (TextFreq ) * TDIDFStats .nmcelem );
218+ for (i = 0 ; i < TDIDFStats .nmcelem ; i ++ )
219+ {
220+ /*
221+ * The text Datums came from an array, so it cannot be compressed or
222+ * stored out-of-line -- it's safe to use VARSIZE_ANY*.
223+ */
224+ Assert (!VARATT_IS_COMPRESSED (sslot .values [i ]) && !VARATT_IS_EXTERNAL (sslot .values [i ]));
225+ TDIDFStats .lookup [i ].element = (text * ) DatumGetPointer (sslot .values [i ]);
226+ TDIDFStats .lookup [i ].frequency = sslot .numbers [i ];
227+ }
120228
121- }
229+ MemoryContextSwitchTo (oldContext );
230+
231+ ReleaseSysCache (statsTuple );
232+ }
233+
234+ static void
235+ check_load_tf_idf_source (void )
236+ {
237+ if (!TDIDFLoaded )
238+ load_tf_idf_source ();
239+ }
240+
241+ static void
242+ forget_tf_idf_stats (void )
243+ {
244+ MemoryContextReset (TFIDFContext );
245+ TDIDFLoaded = false;
246+ }
247+
248+ /*
249+ * bsearch() comparator for a lexeme (non-NULL terminated string with length)
250+ * and a TextFreq. Use length, then byte-for-byte comparison, because that's
251+ * how ANALYZE code sorted data before storing it in a statistic tuple.
252+ * See ts_typanalyze.c for details.
253+ */
254+ static int
255+ compare_lexeme_textfreq (const void * e1 , const void * e2 )
256+ {
257+ const LexemeKey * key = (const LexemeKey * ) e1 ;
258+ const TextFreq * t = (const TextFreq * ) e2 ;
259+ int len1 ,
260+ len2 ;
261+
262+ len1 = key -> length ;
263+ len2 = VARSIZE_ANY_EXHDR (t -> element );
264+
265+ /* Compare lengths first, possibly avoiding a strncmp call */
266+ if (len1 > len2 )
267+ return 1 ;
268+ else if (len1 < len2 )
269+ return -1 ;
270+
271+ /* Fall back on byte-for-byte comparison */
272+ return strncmp (key -> lexeme , VARDATA_ANY (t -> element ), len1 );
273+ }
274+
275+ float4
276+ estimate_idf (char * lexeme , int length )
277+ {
278+ TextFreq * searchres ;
279+ LexemeKey key ;
280+ float4 selec ;
281+
282+ check_load_tf_idf_source ();
283+
284+ key .lexeme = lexeme ;
285+ key .length = length ;
286+
287+ searchres = (TextFreq * ) bsearch (& key , TDIDFStats .lookup , TDIDFStats .nmcelem ,
288+ sizeof (TextFreq ),
289+ compare_lexeme_textfreq );
290+
291+ if (searchres )
292+ {
293+ /*
294+ * The element is in MCELEM. Return precise selectivity (or
295+ * at least as precise as ANALYZE could find out).
296+ */
297+ selec = searchres -> frequency ;
298+ }
299+ else
300+ {
301+ /*
302+ * The element is not in MCELEM. Punt, but assume that the
303+ * selectivity cannot be more than minfreq / 2.
304+ */
305+ selec = TDIDFStats .minfreq / 2 ;
306+ }
307+
308+ return 1.0f / selec ;
309+ }
0 commit comments