Added quantiling UDAFs
[com/gs-lite.git] / src / ftacmp / parse_schema.cc
1 /* ------------------------------------------------\r
2 Copyright 2014 AT&T Intellectual Property\r
3    Licensed under the Apache License, Version 2.0 (the "License");\r
4    you may not use this file except in compliance with the License.\r
5    You may obtain a copy of the License at\r
6 \r
7      http://www.apache.org/licenses/LICENSE-2.0\r
8 \r
9    Unless required by applicable law or agreed to in writing, software\r
10    distributed under the License is distributed on an "AS IS" BASIS,\r
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
12    See the License for the specific language governing permissions and\r
13    limitations under the License.\r
14  ------------------------------------------- */\r
15 \r
16 #include <string>\r
17 #include"parse_fta.h"\r
18 #include "parse_schema.h"\r
19 #include "type_objects.h"\r
20 #include <stdio.h>\r
21 #include <stdlib.h>\r
22 // #include <algo.h>\r
23 #include<algorithm>\r
24 \r
25 using namespace std;\r
26 \r
//		File-scope pointer to a table_list.  It is only declared here; no code
//		visible in this file assigns it.
//		NOTE(review): presumably filled in by the schema parser -- confirm.
table_list *Schema;
28 \r
29 table_def::table_def(const char *name, param_list *oprop, field_entry_list *fel,\r
30                         subqueryspec_list *ql, param_list *selp){\r
31         table_name =name;\r
32         fields = fel->get_list();\r
33         schema_type = OPERATOR_VIEW_SCHEMA;\r
34         qspec_list = ql->spec_list;\r
35 \r
36         if(oprop == NULL) op_properties = new param_list();\r
37         else            op_properties = oprop;\r
38         if(selp == NULL) selpush = new param_list();\r
39         else            selpush = selp;\r
40         base_tables = new param_list();\r
41 };\r
42 \r
43 table_def *table_def::make_shallow_copy(string n){\r
44         table_def *ret = new table_def();\r
45         ret->table_name = n;\r
46         ret->fields = fields;\r
47         ret->schema_type = schema_type;\r
48         ret->base_tables = base_tables;\r
49         ret->op_properties = op_properties;\r
50         ret->qspec_list = qspec_list;\r
51         ret->selpush = selpush;\r
52 \r
53         return ret;\r
54 }\r
55 \r
56 void table_def::mangle_subq_names(std::string mngl){\r
57         int i;\r
58         for(i=0;i<qspec_list.size();++i){\r
59                 subquery_spec *s = qspec_list[i]->duplicate();\r
60                 s->name += mngl;\r
61                 qspec_list[i] = s;\r
62         }\r
63 }\r
64 \r
65 \r
66 \r
67 bool table_def::contains_field(string f){\r
68   int i;\r
69 \r
70   for(i=0;i<fields.size();i++){\r
71         if(fields[i]->get_name() == f){\r
72                 return(true);\r
73         }\r
74   }\r
75   return(false);\r
76 \r
77 }\r
78 \r
79 int table_def::get_field_idx(std::string f){\r
80   int i;\r
81   for(i=0;i<fields.size();i++){\r
82         if(fields[i]->get_name() == f){\r
83                 return(i);\r
84         }\r
85   }\r
86   return(-1);\r
87 }\r
88 \r
89 \r
90 string table_def::get_type_name(std::string f){\r
91   int i;\r
92   for(i=0;i<fields.size();i++){\r
93         if(fields[i]->get_name() == f){\r
94                 return(fields[i]->get_type());\r
95         }\r
96   }\r
97   return("INTERNAL ERROR undefined field " + f);\r
98 }\r
99 \r
100 param_list *table_def::get_modifier_list(std::string f){\r
101   int i;\r
102   for(i=0;i<fields.size();i++){\r
103         if(fields[i]->get_name() == f){\r
104                 return(fields[i]->get_modifier_list());\r
105         }\r
106   }\r
107   fprintf(stderr,"INTERNAL ERROR, no field %s in table %s, call is get_modifier_list.\n",\r
108                         f.c_str(), table_name.c_str() );\r
109   exit(1);\r
110   return(NULL);\r
111 }\r
112 \r
113 \r
114 string table_def::get_fcn(std::string f){\r
115   int i;\r
116   for(i=0;i<fields.size();i++){\r
117         if(fields[i]->get_name() == f){\r
118                 return(fields[i]->get_fcn());\r
119         }\r
120   }\r
121   return("INTERNAL ERROR undefined field " + f);\r
122 \r
123 }\r
124 \r
125 \r
126 string table_def::get_field_basetable(std::string f){\r
127   int i;\r
128   for(i=0;i<fields.size();i++){\r
129         if(fields[i]->get_name() == f){\r
130                 return(fields[i]->get_basetable());\r
131         }\r
132   }\r
133   return("INTERNAL ERROR undefined field " + f);\r
134 \r
135 }\r
136 \r
137 int table_def::verify_no_duplicates(std::string &err){\r
138 \r
139         int f1, f2;\r
140         for(f1=0;f1<fields.size()-1;f1++){\r
141                 string f1_name = fields[f1]->get_name();\r
142                 for(f2=f1+1;f2<fields.size();f2++){\r
143                         if(f1_name == fields[f2]->get_name()){\r
144                                 err.append("Error, table ");\r
145                                 err.append(table_name);\r
146                                 err.append(" has a duplicate field :");\r
147                                 err.append(f1_name);\r
148                                 err.append("\n");\r
149                                 return(1);\r
150                         }\r
151                 }\r
152         }\r
153         return(0);\r
154 }\r
155 \r
156 int table_def::verify_access_fcns(std::string &err){\r
157         int retval = 0, f;\r
158 \r
159         for(f=0;f<fields.size();++f){\r
160                 if(fields[f]->get_fcn() == ""){\r
161                         err += "Error, PROTOCOL field "+table_name+"."+fields[f]->get_name()+" has an empty access function.\n";\r
162                         retval = 1;\r
163                 }\r
164         }\r
165 \r
166         return(retval);\r
167 }\r
168 \r
169 int table_def::add_field(field_entry *fe){\r
170         string fe_name = fe->get_name();\r
171         int f;\r
172 \r
173         for(f=0;f<fields.size();f++){\r
174                 if(fe_name == fields[f]->get_name()){\r
175                         return(-1);\r
176                 }\r
177         }\r
178         fields.push_back(fe);\r
179         return(0);\r
180 }\r
181 \r
182 \r
183 vector<string> table_list::get_table_names(){\r
184         vector<string> retval;\r
185         int i;\r
186         for(i=0;i<tbl_list.size();i++){\r
187                 retval.push_back(tbl_list[i]->get_tbl_name());\r
188         }\r
189         return(retval);\r
190 }\r
191 \r
192 \r
193 int table_list::find_tbl(string t){\r
194         int i;\r
195         for(i=0;i<tbl_list.size();i++)\r
196                 if(tbl_list[i]->get_tbl_name() == t)\r
197                         return(i);\r
198 //      fprintf(stderr,"INTERNAL ERROR: Could not find table %s in table_list::find_tbl\n",t.c_str());\r
199         return(-1);\r
200 }\r
201 \r
202 vector<field_entry *> table_list::get_fields(string t){\r
203         int pos = find_tbl(t);\r
204         if(pos<0){\r
205                 vector<field_entry *> r;\r
206                 return(r);\r
207         }\r
208         return(tbl_list[pos]->get_fields());\r
209 }\r
210 \r
211 field_entry *table_list::get_field(string t, int i){\r
212         int pos = find_tbl(t);\r
213         if(pos<0){\r
214                 return(NULL);\r
215         }\r
216         return(tbl_list[pos]->get_field(i));\r
217 }\r
218 \r
219 int table_list::get_field_idx(string t, string f){\r
220         int pos = find_tbl(t);\r
221         if(pos<0){\r
222                 return(-1);\r
223         }\r
224         return(tbl_list[pos]->get_field_idx(f));\r
225 }\r
226 \r
227 \r
228 vector<int> table_list::get_tblref_of_field(string f){\r
229         int i;\r
230         vector<int> retval;\r
231 \r
232         for(i=0;i<tbl_list.size();i++){\r
233                 if( tbl_list[i]->contains_field(f) ){\r
234                         retval.push_back(i);\r
235                 }\r
236         }\r
237         return(retval);\r
238 }\r
239 \r
240 //              TODO: this seems to duplicate find_tbl\r
241 int table_list::get_table_ref(string t){\r
242         int i;\r
243         for(i=0;i<tbl_list.size();i++){\r
244                 if(tbl_list[i]->get_tbl_name() == t ){\r
245                         return(i);\r
246                 }\r
247         }\r
248         return(-1);\r
249 }\r
250 \r
251 \r
252 //                              Use to unroll hierarchically defined\r
253 //                              tables.  Used for source tables.\r
254 //                              Also, do some sanity checking, better\r
255 //                              to find the errors now when its easy to report\r
256 //                              than later when its obscure.\r
257 //\r
258 //                              Also, process the unpacking functions.\r
259 //                              and verify that all field unpacking functions\r
260 //                              are listed in the schema.\r
261 int table_list::unroll_tables(string &err){\r
262 //              First, verify there are no repeat field names in any\r
263 //              of the tables.\r
264 \r
265         int f, tref, p, t, ret, retval;\r
266 \r
267         for(t=0;t<tbl_list.size();t++){\r
268           if(tbl_list[t]->get_schema_type() == UNPACK_FCNS_SCHEMA){\r
269                 for(f=0;f<tbl_list[t]->ufcn_list.size();f++){\r
270                         ufcn_fcn[ tbl_list[t]->ufcn_list[f]->name ] = tbl_list[t]->ufcn_list[f]->fcn;\r
271                         ufcn_cost[ tbl_list[t]->ufcn_list[f]->name ] = tbl_list[t]->ufcn_list[f]->cost;\r
272                         \r
273                 }\r
274           }\r
275         }\r
276 \r
277         for(t=0;t<tbl_list.size();t++){\r
278           if(tbl_list[t]->get_schema_type() != UNPACK_FCNS_SCHEMA){\r
279 //                      No duplicate field names\r
280                 ret = tbl_list[t]->verify_no_duplicates(err);\r
281                 if(ret) retval = ret;\r
282 \r
283 //                      every field has an access function\r
284                 if(tbl_list[t]->get_schema_type() == PROTOCOL_SCHEMA){\r
285                         retval = tbl_list[t]->verify_access_fcns(err);\r
286                         if(ret) retval = ret;\r
287                 }\r
288 \r
289 //                      Every type can be parsed\r
290                 vector<field_entry *> flds = tbl_list[t]->get_fields();\r
291                 for(f=0;f<flds.size();++f){\r
292                         data_type dt(flds[f]->get_type());\r
293                         if(dt.get_type() == undefined_t){\r
294                                 err += "ERROR, field "+flds[f]->get_name()+" of table "+tbl_list[t]->get_tbl_name()+" has unrecognized type "+flds[f]->get_type()+"\n";\r
295                                 retval = 1;\r
296                         }\r
297                         if(dt.get_type() == fstring_t){\r
298                                 err += "ERROR, field "+flds[f]->get_name()+" of table "+tbl_list[t]->get_tbl_name()+" has unsupported type "+flds[f]->get_type()+"\n";\r
299                                 retval = 1;\r
300                         }\r
301                 }\r
302 \r
303 //                      Ensure that the unpack functions, if any, exist.\r
304                 for(f=0;f<flds.size();++f){\r
305                         set<string> ufcns = flds[f]->get_unpack_fcns();\r
306                         set<string>::iterator ssi;\r
307                         for(ssi=ufcns.begin();ssi!=ufcns.end();ssi++){\r
308                                 if(ufcn_fcn.count((*ssi))==0){\r
309                                         err += "ERROR, field "+flds[f]->get_name()+" of table "+tbl_list[t]->get_tbl_name()+" has unrecognized unpacking function "+(*ssi)+"\n";\r
310                                         retval = 1;\r
311                                 }\r
312                         }\r
313                 }\r
314 \r
315 //                      annote the original source of the field -- for prefilter\r
316                 string tbl_name = tbl_list[t]->get_tbl_name();\r
317                 vector<field_entry *> fev = tbl_list[t]->get_fields();\r
318                 for(f=0;f<fev.size();++f)\r
319                         fev[f]->set_basetable(tbl_name);\r
320           }\r
321         }\r
322 \r
323         if(retval) return(retval);\r
324 \r
325 //              Next, build a predecessors graph.\r
326 //              Verify that all referenced tables exist.\r
327 \r
328         vector< vector<int> > predecessors;             // list of tables inherited from.\r
329         vector<int> n_pred;                                             // number of (remaining) predecessors.\r
330                                                                                         // -1 indicates a processed table.\r
331 \r
332         for(t=0;t<tbl_list.size();t++){\r
333           if(tbl_list[t]->get_schema_type() != UNPACK_FCNS_SCHEMA){\r
334                 vector<string> pred_tbls = tbl_list[t]->get_pred_tbls();\r
335                 vector<int> pred_ref;\r
336                 for(p=0;p<pred_tbls.size();p++){\r
337                         tref = this->get_table_ref(pred_tbls[p]);\r
338                         if(tref < 0){\r
339                                 err.append("Error: table ");\r
340                                 err.append(tbl_list[t]->get_tbl_name());\r
341                                 err.append(" referenced non-existent table ");\r
342                                 err.append(pred_tbls[p]);\r
343                                 err.append("\n");\r
344                                 return(2);\r
345                         }else{\r
346                                 pred_ref.push_back(tref);\r
347                         }\r
348                 }\r
349                 predecessors.push_back(pred_ref);\r
350                 n_pred.push_back(pred_ref.size());\r
351           }else{\r
352                 vector<int> tmp_iv;\r
353                 predecessors.push_back(tmp_iv);\r
354                 n_pred.push_back(0);\r
355           }\r
356         }\r
357 \r
358 \r
359         int n_remaining = predecessors.size();\r
360         int n_total = n_remaining;\r
361 \r
362 //              Run through the DAG and pull off one root at a time (n_pred == 0).\r
363 //              there might be a cycle, so iterate until n_remaining == 0.\r
364 \r
365         while(n_remaining > 0){\r
366 \r
367 //              Find a root\r
368                 int root;\r
369                 for(root=0;root < n_total;root++){\r
370                         if(n_pred[root] == 0)\r
371                                 break;\r
372                 }\r
373                 if(root == n_total){    // didn't find a root.\r
374                         err.append("Error : cycle in inheritance among the following tables:");\r
375                         int r;\r
376                         for(r=0;r<n_total;r++){\r
377                                 if(n_pred[r] > 0){\r
378                                         err.append(" ");\r
379                                         err.append(tbl_list[r]->get_tbl_name());\r
380                                 }\r
381                         }\r
382                         return(3);\r
383                 }\r
384 \r
385 //                      I'd adding fields from the root table to the\r
386                 vector<field_entry *> pred_fields = tbl_list[root]->get_fields();\r
387 \r
388 \r
389 //                      Scan for all successors of the root.\r
390                 int s, f;\r
391                 for(s=0;s<n_total;s++){\r
392                         if(find((predecessors[s]).begin(), (predecessors[s]).end(), root) !=\r
393                                         (predecessors[s]).end() ){\r
394 \r
395 //                      s is a successor : add the fields from the root.\r
396                                 for(f=0;f<pred_fields.size();f++){\r
397                                         retval = tbl_list[s]->add_field(pred_fields[f]);\r
398                                         if(retval < 0){\r
399                                                 err.append("Warning: field ");\r
400                                                 err.append(pred_fields[f]->get_name());\r
401                                                 err.append(" already exists in table ");\r
402                                                 err.append(tbl_list[s]->get_tbl_name());\r
403                                                 err.append(" (inheriting from table ");\r
404                                                 err.append(tbl_list[root]->get_tbl_name());\r
405                                                 err.append(").\n");\r
406                                         }\r
407                                 }\r
408 \r
409 //                      s has one less predecessor.\r
410                                 n_pred[s]--;\r
411                         }\r
412                 }\r
413 \r
414 //                      Indicate that the root has been processed.\r
415                 n_pred[root] = -1;\r
416                 n_remaining--;\r
417         }\r
418 \r
419 \r
420 //                      Done!\r
421         return(0);\r
422 }\r
423 \r
424 int table_list::add_table(table_def *td){\r
425         int tref = get_table_ref(td->get_tbl_name());\r
426         if(tref >= 0) return(tref);\r
427         tbl_list.push_back(td);\r
428         return(tbl_list.size() - 1);\r
429 }\r
430 \r
431 \r
432 \r
433 //////////////////////////////////////////////////////\r
434 //////////              Serialize functions.\r
435 //////////              The deserialize fcn is the parser.\r
436 \r
437 \r
438 string param_list::to_string(){\r
439         string retval;\r
440         map<string, string>::iterator mssi;\r
441         bool first_exec=true;\r
442         for(mssi=pmap.begin();mssi!=pmap.end();++mssi){\r
443                 if(first_exec){  first_exec=false; }\r
444                 else{ retval+=",";  }\r
445                 retval += (*mssi).first + " ";\r
446                 retval += (*mssi).second;\r
447         }\r
448         return(retval);\r
449 }\r
450 \r
451 string field_entry::to_string(){\r
452         string retval = type + " " + name + " " + function;\r
453         if(mod_list->size() > 0){\r
454                 retval += " ( " + mod_list->to_string() + " ) ";\r
455         }\r
456 \r
457         return(retval);\r
458 }\r
459 \r
460 \r
461 \r
462 \r
463 string table_def::to_string(){\r
464         int q;\r
465         string retval;\r
466         switch(schema_type){\r
467         case PROTOCOL_SCHEMA:\r
468                 retval = "TABLE ";\r
469                 break;\r
470         case STREAM_SCHEMA:\r
471                 retval = "STREAM ";\r
472                 break;\r
473         case OPERATOR_VIEW_SCHEMA:\r
474                 retval += "OPERATOR_VIEW ";\r
475                 break;\r
476         default:\r
477                 retval = "ERROR UNKNOWN TABLE TYPE ";\r
478                 break;\r
479         }\r
480 \r
481         retval += table_name + " ";\r
482 \r
483         if(base_tables->size() > 0){\r
484                 retval += "( "+base_tables->to_string() + " ) ";\r
485         }\r
486 \r
487         retval += "{\n";\r
488 \r
489         if(schema_type == OPERATOR_VIEW_SCHEMA){\r
490                 retval += "\tOPERATOR ("+op_properties->to_string()+")\n";\r
491                 retval += "\tFIELDS{\n";\r
492         }\r
493 \r
494         int f;\r
495         for(f=0;f<fields.size();f++){\r
496                 retval += "\t" + fields[f]->to_string() + ";\n";\r
497         }\r
498 \r
499         if(schema_type == OPERATOR_VIEW_SCHEMA){\r
500                 retval += "\tSUBQUERIES{\n";\r
501                 for(q=0;q<qspec_list.size();++q){\r
502                         if(q>0) retval += ";\n";\r
503                         retval += qspec_list[q]->to_string();\r
504                 }\r
505                 retval += "\t}\n";\r
506                 retval += "\tSELECTION_PUSHDOWN ("+selpush->to_string()+")\n";\r
507         }\r
508 \r
509         \r
510 \r
511         retval += "}\n";\r
512 \r
513         return(retval);\r
514 }\r
515 \r
516 string table_list::to_string(){\r
517         string retval;\r
518         int t;\r
519         for(t=0;t<tbl_list.size();t++){\r
520                 retval += tbl_list[t]->to_string();\r
521                 retval += "\n";\r
522         }\r
523         return(retval);\r
524 }\r