1 /*************************************************************************
\r
3 * Licensed under the Apache License, Version 2.0 (the "License");
\r
4 * you may not use this file except in compliance with the License.
\r
5 * You may obtain a copy of the License at
\r
7 * http://www.apache.org/licenses/LICENSE-2.0
\r
9 * Unless required by applicable law or agreed to in writing, software
\r
10 * distributed under the License is distributed on an "AS IS" BASIS,
\r
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
\r
12 * See the License for the specific language governing permissions and
\r
13 * limitations under the License.
\r
14 ***************************************************************************/
\r
16 #include <algorithm>
\r
20 #include "regxstring.h"
\r
21 #include "regxstring_impl.h"
\r
24 # include <iostream>
\r
26 static void printRefs(__DZ_OSTRINGSTREAM & oss,const REGXSTRING_NS::__Refs & refs)
\r
28 for(REGXSTRING_NS::__Refs::const_iterator i = refs.begin();i != refs.end();++i)
\r
29 std::cout<<"\t"<<oss.str().substr(i->first,i->second);
\r
32 # define _OSS_OUT(msg) { \
\r
33 std::cout<<msg<<" : "<<gdata.oss_.str(); \
\r
34 printRefs(gdata.oss_,gdata.refs_); \
\r
35 std::cout<<std::endl;}
\r
37 # define _OSS_OUT(str)
\r
42 // Replacements for new and delete
\r
45 T * ret = __DZ_ALLOC<T>().allocate(1);
\r
49 template<class T,class A>
\r
50 T * New(const A & a){
\r
51 T * ret = __DZ_ALLOC<T>().allocate(1);
\r
52 return new (ret) T(a);
\r
55 template<class T,class A,class B>
\r
56 T * New(const A & a,const B & b){
\r
57 T * ret = __DZ_ALLOC<T>().allocate(1);
\r
58 return new (ret) T(a,b);
\r
61 template<class T,class A,class B,class C>
\r
62 T * New(const A & a,const B & b,const C & c){
\r
63 T * ret = __DZ_ALLOC<T>().allocate(1);
\r
64 return new (ret) T(a,b,c);
\r
68 typedef char __dummy[sizeof(T)];
\r
71 __DZ_ALLOC<T>().deallocate(p,1);
\r
77 bool operator ()(__NodeBase * n) const{
\r
82 static const char * const SEP = " ";
\r
84 static __DZ_STRING sep(int lvl)
\r
92 static void appendNode(__NodeBase *& parent,__NodeBase * node)
\r
97 parent = New<__Seq>(node);
\r
99 parent->AppendNode(node);
\r
104 inline bool IsRepeat(int ch){
\r
105 return ch == '?' || ch == '+' || ch == '*';
\r
108 inline bool IsBegin(int ch){
\r
112 inline bool IsEnd(int ch){
\r
116 inline bool IsSlash(int ch){
\r
120 inline bool IsSetBegin(int ch){
\r
124 inline bool IsSetEnd(int ch){
\r
128 inline bool IsGroupBegin(int ch){
\r
132 inline bool IsGroupEnd(int ch){
\r
136 inline bool IsSelect(int ch){
\r
140 inline bool IsRepeatBegin(int ch){
\r
144 inline bool IsRepeatEnd(int ch){
\r
148 inline bool NeedEnd(int ch){
\r
149 return IsGroupEnd(ch) || IsRepeatEnd(ch);
\r
152 inline bool IsDigit(int ch){
\r
153 return '0' <= ch && ch <= '9';
\r
156 inline int TransDigit(int ch){
\r
160 inline bool IsDash(int ch){
\r
164 inline bool IsAny(int ch){
\r
168 inline int IsSubexpMark(int ch){
\r
169 return (ch == ':' || ch == '=' || ch == '!' || ch == '>' ? ch : 0);
\r
172 inline int IsSubexpMark(const char * s){
\r
173 return (*s == '?' ? IsSubexpMark(*(s + 1)) : 0);
\r
176 inline char TransSlash(int ch){
\r
178 case 'f':return '\f';
\r
179 case 'n':return '\n';
\r
180 case 'r':return '\r';
\r
181 case 't':return '\t';
\r
182 case 'v':return '\v';
\r
188 //struct __ParseData
\r
189 int __ParseData::inEnds(int ch) const
\r
192 for(__Ends::const_reverse_iterator i = ends_.rbegin();i != ends_.rend();++i,++ret){
\r
195 if(Tools::NeedEnd(*i))
\r
201 //struct __NodeBase
\r
202 __NodeBase * const __NodeBase::REP_NULL = (__NodeBase *)1;
\r
205 int __NodeBase::ref = 0;
\r
208 __NodeBase::~__NodeBase()
\r
215 int __NodeBase::Repeat(int ch)
\r
220 void __NodeBase::AppendNode(__NodeBase * node)
\r
226 __Edge::__Edge(int ch)
\r
227 : begin_(ch == '^')
\r
230 __NodeBase * __Edge::Optimize(__ParseData & pdata)
\r
235 void __Edge::RandString(__GenerateData & gdata) const
\r
237 _OSS_OUT("__Edge");
\r
240 void __Edge::Debug(std::ostream & out,int lvl) const
\r
242 out<<sep(lvl)<<(begin_ ? "BEGIN" : "END")<<"\n";
\r
246 __Text::__Text(int ch)
\r
250 __NodeBase * __Text::Optimize(__ParseData & pdata)
\r
252 return (str_.empty() ? REP_NULL : 0);
\r
255 void __Text::RandString(__GenerateData & gdata) const
\r
258 _OSS_OUT("__Text");
\r
261 void __Text::Debug(std::ostream & out,int lvl) const
\r
263 out<<sep(lvl)<<"Text("<<str_<<")\n";
\r
267 __Charset::__Charset()
\r
271 __Charset::__Charset(const __DZ_STRING & str,bool include)
\r
276 __NodeBase * __Charset::Optimize(__ParseData & pdata)
\r
282 inc_ = str_.size();
\r
286 void __Charset::RandString(__GenerateData & gdata) const
\r
288 assert(inc_ == str_.size());
\r
289 gdata.oss_<<str_[rand() % inc_];
\r
290 _OSS_OUT("__Charset");
\r
293 void __Charset::Debug(std::ostream & out,int lvl) const
\r
295 out<<sep(lvl)<<"Charset(INCLUDE"
\r
296 <<", "<<str_<<")\n";
\r
299 void __Charset::Exclude()
\r
304 void __Charset::AddChar(int ch)
\r
306 str_.push_back(ch);
\r
309 void __Charset::AddRange(int from,int to)
\r
311 for(;from <= to;++from)
\r
312 str_.push_back(from);
\r
315 void __Charset::AddRange(__Charset * node)
\r
322 void __Charset::Unique()
\r
324 inc_ ? unique() : reverse();
\r
327 void __Charset::unite(__Charset & node)
\r
334 void __Charset::reverse()
\r
336 const int _CHAR_MIN = 32;
\r
337 const int _CHAR_MAX = 126;
\r
342 size_t i = 0,e = s.size();
\r
343 for(;c <= _CHAR_MAX && i < e;++i){
\r
346 AddRange(c,ch - 1);
\r
347 c = std::max(ch + 1,_CHAR_MIN);
\r
350 AddRange(c,_CHAR_MAX);
\r
354 void __Charset::unique()
\r
357 std::sort(str_.begin(),str_.end());
\r
358 str_.erase(std::unique(str_.begin(),str_.end()),str_.end());
\r
363 __Repeat::__Repeat(__NodeBase * node,int ch)
\r
369 case '?':min_ = 0;max_ = 1;break;
\r
370 case '+':min_ = 1;max_ = INFINITE;break;
\r
371 case '*':min_ = 0;max_ = INFINITE;break;
\r
376 __Repeat::__Repeat(__NodeBase * node,int min,int max)
\r
382 __Repeat::~__Repeat(){
\r
386 __NodeBase * __Repeat::Optimize(__ParseData & pdata)
\r
388 min_ &= _CLEAR_FLAGS;
\r
389 max_ &= _CLEAR_FLAGS;
\r
391 max_ = min_ + pdata.config_.repeatInfinite;
\r
392 if( max_ > _REPEAT_MAX)
\r
393 max_ = _REPEAT_MAX;
\r
395 if(!node_ || (min_ > max_) || (!min_ && !max_))
\r
397 __NodeBase * r = node_->Optimize(pdata);
\r
404 if(1 == max_ && 1 == min_){
\r
413 void __Repeat::RandString(__GenerateData & gdata) const
\r
415 for(int t = min_ + rand() % max_;t > 0;t--)
\r
416 node_->RandString(gdata);
\r
417 _OSS_OUT("__Repeat");
\r
420 void __Repeat::Debug(std::ostream & out,int lvl) const
\r
422 out<<sep(lvl)<<"Repeat["<<min_<<", "<<(min_ + max_ - 1)<<"]\n";
\r
425 node_->Debug(out,lvl);
\r
427 out<<sep(lvl)<<"NULL\n";
\r
430 int __Repeat::Repeat(int ch)
\r
434 case '?':min_ |= _NON_GREEDY;return 2;break;
\r
435 case '+':min_ |= _PROSSESSIVE;return 2;break;
\r
443 __Seq::__Seq(__NodeBase * node)
\r
448 for(__Con::const_iterator i = seq_.begin(),e = seq_.end();i != e;++i)
\r
452 __NodeBase * __Seq::Optimize(__ParseData & pdata)
\r
456 for(__Con::iterator i = seq_.begin(),e = seq_.end();i != e;++i)
\r
458 __NodeBase * r = (*i)->Optimize(pdata);
\r
461 *i = (r == REP_NULL ? 0 : r);
\r
464 seq_.erase(std::remove_if(seq_.begin(),seq_.end(),__IsNull()),seq_.end());
\r
467 if(seq_.size() == 1){
\r
468 __NodeBase * r = seq_[0];
\r
475 void __Seq::RandString(__GenerateData & gdata) const
\r
477 for(__Con::const_iterator i = seq_.begin(),e = seq_.end();i != e;++i)
\r
478 (*i)->RandString(gdata);
\r
482 void __Seq::Debug(std::ostream & out,int lvl) const
\r
484 out<<sep(lvl)<<"Seq("<<seq_.size()<<")\n";
\r
486 for(__Con::const_iterator i = seq_.begin(),e = seq_.end();i != e;++i){
\r
488 (*i)->Debug(out,lvl);
\r
490 out<<sep(lvl)<<"NULL\n";
\r
494 void __Seq::AppendNode(__NodeBase * node)
\r
497 if(__Text * cur = dynamic_cast<__Text *>(node))
\r
498 if(__Text * prev = dynamic_cast<__Text *>(seq_.back())){
\r
503 seq_.push_back(node);
\r
507 __Group::__Group(__NodeBase * node,int mark)
\r
511 if(!Tools::IsSubexpMark(mark_))
\r
515 __Group::~__Group()
\r
520 __NodeBase * __Group::Optimize(__ParseData & pdata)
\r
522 if(!node_ || mark_ == '!')
\r
524 __NodeBase * r = node_->Optimize(pdata);
\r
540 mark_ = (mark_ & (INDEX - 1)) - 1;
\r
544 void __Group::RandString(__GenerateData & gdata) const
\r
547 assert(0 <= mark_ && mark_ < MAX_GROUPS);
\r
548 if(mark_ >= gdata.refs_.size())
\r
549 gdata.refs_.resize(mark_ + 1);
\r
550 gdata.refs_.back() = __RefValue(gdata.oss_.str().size(),__DZ_STRING::npos);
\r
551 node_->RandString(gdata);
\r
552 assert(mark_ < gdata.refs_.size());
\r
553 gdata.refs_[mark_].second = gdata.oss_.str().size() - gdata.refs_[mark_].first;
\r
554 _OSS_OUT("__Group");
\r
557 void __Group::Debug(std::ostream & out,int lvl) const
\r
559 out<<sep(lvl)<<"Group(";
\r
561 case ':':out<<"?:";break;
\r
562 case '=':out<<"?=";break;
\r
563 case '!':out<<"?!";break;
\r
564 case '>':out<<"?>";break;
\r
565 default:out<<(mark_ & (INDEX - 1));
\r
570 node_->Debug(out,lvl);
\r
572 out<<sep(lvl)<<"NULL\n";
\r
576 __Select::__Select(__NodeBase * node)
\r
581 __Select::~__Select()
\r
583 for(__Con::const_iterator i = sel_.begin(),e = sel_.end();i != e;++i)
\r
587 __NodeBase * __Select::Optimize(__ParseData & pdata)
\r
591 for(__Con::iterator i = sel_.begin(),e = sel_.end();i != e;++i)
\r
593 __NodeBase * r = (*i)->Optimize(pdata);
\r
596 *i = (r == REP_NULL ? 0 : r);
\r
599 sel_.erase(std::remove_if(sel_.begin(),sel_.end(),__IsNull()),sel_.end());
\r
602 if(sel_.size() == 1){
\r
603 __NodeBase * r = sel_[0];
\r
611 void __Select::RandString(__GenerateData & gdata) const
\r
614 sel_[rand() % sz_]->RandString(gdata);
\r
615 _OSS_OUT("__Select");
\r
618 void __Select::Debug(std::ostream & out,int lvl) const
\r
620 out<<sep(lvl)<<"Select("<<sel_.size()<<")\n";
\r
622 for(__Con::const_iterator i = sel_.begin(),e = sel_.end();i != e;++i)
\r
624 (*i)->Debug(out,lvl);
\r
626 out<<sep(lvl)<<"NULL\n";
\r
629 void __Select::AppendNode(__NodeBase * node)
\r
631 sel_.push_back(node);
\r
635 __Ref::__Ref(int index)
\r
639 __NodeBase * __Ref::Optimize(__ParseData & pdata)
\r
645 void __Ref::RandString(__GenerateData & gdata) const
\r
647 assert(index_ < gdata.refs_.size());
\r
648 const __RefValue & ref = gdata.refs_[index_];
\r
649 __DZ_STRING str = gdata.oss_.str();
\r
650 if(ref.first < str.size())
\r
651 gdata.oss_<<str.substr(ref.first,ref.second);
\r
652 _OSS_OUT("__Ref("<<index_<<")");
\r
655 void __Ref::Debug(std::ostream & out,int lvl) const
\r
657 out<<sep(lvl)<<"Ref("<<index_<<")\n";
\r
660 //class __CRegxString
\r
661 __CRegxString::__CRegxString()
\r
665 void __CRegxString::ParseRegx(const __DZ_STRING & regx,const Config * config)
\r
672 __ParseData pdata(config ? *config : def);
\r
673 top_ = processSeq(pdata).first;
\r
676 __NodeBase * r = top_->Optimize(pdata);
\r
679 top_ = (r == __NodeBase::REP_NULL ? 0 : r);
\r
682 // srand((unsigned int)time(0));
\r
683 // Changed by Adrian Lita
\r
686 const __DZ_STRING & __CRegxString::RandString()
\r
690 __DZ_OSTRINGSTREAM oss;
\r
691 __GenerateData gdata(oss);
\r
692 top_->RandString(gdata);
\r
698 void __CRegxString::Debug(std::ostream & out) const{
\r
699 out<<"regx_ : "<<regx_<<"\nstructure :\n";
\r
701 top_->Debug(out,0);
\r
706 void __CRegxString::uninit()
\r
715 __CRegxString::__Ret __CRegxString::processSeq(__ParseData & pdata)
\r
718 __NodeBase * cur = 0;
\r
720 for(const size_t e = regx_.length();pdata.i_ < e;++pdata.i_){
\r
721 int ch = regx_[pdata.i_];
\r
723 if(Tools::IsBegin(ch)){
\r
724 cur = New<__Edge>(ch);
\r
729 if(Tools::IsRepeat(ch) && cur){
\r
730 int r = cur->Repeat(ch);
\r
733 cur = New<__Repeat>(cur,ch);
\r
737 if(Tools::IsRepeatBegin(ch)){
\r
738 cur = processRepeat(cur,pdata);
\r
741 appendNode(ret.first,cur);
\r
742 ret.second = pdata.inEnds(ch);
\r
745 if(Tools::IsSelect(ch))
\r
746 return processSelect(ret.first,pdata);
\r
747 if(Tools::IsEnd(ch))
\r
748 cur = New<__Edge>(ch);
\r
749 else if(Tools::IsAny(ch)){
\r
750 __Charset * set = New<__Charset>("\n",false);
\r
753 }else if(Tools::IsSetBegin(ch))
\r
754 cur = processSet(pdata);
\r
755 else if(Tools::IsGroupBegin(ch))
\r
756 cur = processGroup(pdata);
\r
757 else if(Tools::IsSlash(ch))
\r
758 cur = processSlash(true,pdata).first;
\r
760 cur = New<__Text>(ch);
\r
762 appendNode(ret.first,cur);
\r
766 __CRegxString::__Ret __CRegxString::processSlash(bool bNode,__ParseData & pdata)
\r
769 __Ret ret((__NodeBase *)0,pdata.i_ < regx_.length() ? Tools::TransSlash(regx_[pdata.i_]) : '\\');
\r
770 __Charset * set = 0;
\r
771 switch(ret.second){
\r
772 case 'd':set = New<__Charset>("0123456789",true);break;
\r
773 case 'D':set = New<__Charset>("0123456789",false);break;
\r
774 case 's':set = New<__Charset>(/*"\f\n\r\v"*/"\t ",true);break;
\r
775 case 'S':set = New<__Charset>(/*"\f\n\r\v"*/"\t ",false);break;
\r
776 case 'w':{ //A-Za-z0-9_
\r
777 set = New<__Charset>();
\r
778 set->AddRange('A','Z');
\r
779 set->AddRange('a','z');
\r
780 set->AddRange('0','9');
\r
783 case 'W':{ //^A-Za-z0-9_
\r
784 set = New<__Charset>();
\r
785 set->AddRange('A','Z');
\r
786 set->AddRange('a','z');
\r
787 set->AddRange('0','9');
\r
797 if(Tools::IsDigit(ret.second)){
\r
798 int i = ret.second - '0';
\r
801 else if(i <= pdata.ref_)
\r
802 ret.first = New<__Ref>(i);
\r
805 ret.first = New<__Text>(ret.second);
\r
810 __NodeBase * __CRegxString::processSet(__ParseData & pdata)
\r
812 size_t bak = pdata.i_++;
\r
813 __Charset * ret = New<__Charset>();
\r
816 for(const size_t e = regx_.length();pdata.i_ < e;++pdata.i_){
\r
817 int ch = regx_[pdata.i_];
\r
818 if(begin && Tools::IsBegin(ch)){
\r
823 if(Tools::IsDash(ch) && prev){
\r
825 if(processRange(to,pdata)){
\r
826 ret->AddRange(prev,to);
\r
832 ret->AddChar(prev);
\r
833 if(Tools::IsSetEnd(ch)){
\r
837 if(Tools::IsSlash(ch)){
\r
838 __Ret s = processSlash(false,pdata);
\r
839 if(s.first){ //charset
\r
840 ret->AddRange(dynamic_cast<__Charset *>(s.first));
\r
851 return New<__Text>('[');
\r
854 __NodeBase * __CRegxString::processGroup(__ParseData & pdata)
\r
856 int bak = pdata.i_++;
\r
857 int mark = ignoreSubexpMarks(pdata);
\r
858 pdata.ends_.push_back(')');
\r
860 mark = ++pdata.ref_;
\r
861 __Ret ret = processSeq(pdata);
\r
862 pdata.ends_.pop_back();
\r
864 return New<__Group>(ret.first,mark);
\r
867 return New<__Text>('(');
\r
870 __CRegxString::__Ret __CRegxString::processSelect(__NodeBase * node,__ParseData & pdata)
\r
872 __Ret ret(New<__Select>(node),0);
\r
873 pdata.ends_.push_back('|');
\r
874 for(const size_t e = regx_.length();pdata.i_ < e;){
\r
876 __Ret r = processSeq(pdata);
\r
877 ret.first->AppendNode(r.first);
\r
879 ret.second = r.second - 1;
\r
883 pdata.ends_.pop_back();
\r
887 __NodeBase * __CRegxString::processRepeat(__NodeBase * node,__ParseData & pdata)
\r
889 if(node && node->Repeat(0)){
\r
890 size_t bak = pdata.i_++;
\r
891 int min = 0,max = __Repeat::INFINITE;
\r
892 switch(processInt(min,pdata)){
\r
895 if(processInt(max,pdata) == '}')
\r
896 return New<__Repeat>(node,min,(min < max ? max : min));
\r
899 return New<__Repeat>(node,min,min);
\r
904 return New<__Text>('{');
\r
907 int __CRegxString::processInt(int & result,__ParseData & pdata)
\r
910 for(const size_t e = regx_.length();pdata.i_ < e;++pdata.i_){
\r
911 int ch = regx_[pdata.i_];
\r
912 if(Tools::IsDigit(ch)){
\r
913 ch = Tools::TransDigit(ch);
\r
929 bool __CRegxString::processRange(int & result,__ParseData & pdata)
\r
931 if(++pdata.i_ < regx_.size() && regx_[pdata.i_] != ']'){
\r
932 result = regx_[pdata.i_];
\r
939 int __CRegxString::ignoreSubexpMarks(__ParseData & pdata)
\r
942 if(pdata.i_ + 1 < regx_.size()){
\r
943 ret = Tools::IsSubexpMark(®x_[pdata.i_]);
\r