/*
 * this wrapper coded by Michael Tesch with ideas from Max Okumoto.
 * Released under the GPL.  A copy of the GPL can be found almost anywhere,
 * but the most official is at www.gnu.org
 *
 * Copyright (c) 2000 Michael Tesch
 * (send bug fixes / improvements to tesch@cs.wisc.edu)
 * version 0.01
 */

#include "pre.h"

/*
 * destructor: hand the compiled pattern and the study data (whichever
 * of the two exist) back to pcre.
 */
pRegex::~pRegex()
{
    // compiled pattern, set by compile()
    if (_pcre) {
	pcre_free(_pcre);
    }

    // optional study data, set by study()
    if (_pcre_extra) {
	pcre_free(_pcre_extra);
    }
}

/*
 * convert a string of perl-style pattern modifiers into an option word.
 *
 * every character of m sets one bit in the result: either a PCRE_* flag
 * or one of the wrapper's own PREGEX::_PREGEX_* flags.  an empty string
 * means "no change" and yields the options currently held in _pcre_opts.
 *
 * 'e' (not implemented) and any unrecognised character print a message
 * on cerr and abort().
 */
unsigned
pRegex::parse_mods(const char *m)
{
    // nothing requested: keep whatever options are already in effect
    if (*m == '\0')
	return _pcre_opts;

    unsigned flags = 0;

    for (const char *c = m; *c != '\0'; ++c) {
	const char mod = *c;

	if (mod == 'g') {
	    flags |= PREGEX::_PREGEX_GLOBAL;
	} else if (mod == 'i') {
	    flags |= PCRE_CASELESS;
	} else if (mod == 'm') {
	    flags |= PCRE_MULTILINE;
	} else if (mod == 'o') {
	    flags |= PREGEX::_PREGEX_OPTIMIZE;
	} else if (mod == 's') {
	    flags |= PCRE_DOTALL;
	} else if (mod == 'x') {
	    flags |= PCRE_EXTENDED;
	} else if (mod == 'd') {
	    // wrapper-only flag (not Perl compatible): enable debug printing
	    flags |= PREGEX::_PREGEX_DEBUG_MSGS;
	} else if (mod == 'e') {
	    // evaluating the replacement as an expression is not supported
	    cerr << "pRegex /e : excercise for the reader.\n";
	    abort();
	} else {
	    cerr << "illegal pattern modifier: '" << mod << "'\n";

	    // should throw exception, but this'll get their attention too.
	    abort();
	}
    }
    return flags;
}

/*
 * (re)compile _pattern with the option bits currently in _pcre_opts.
 *
 * any previously compiled pattern is released first.  a compile error is
 * reported on cerr and the program abort()s.  after a successful compile
 * the pattern is studied again if study data already exists or the
 * optimize flag is set.
 */
void
pRegex::compile()
{
    // drop the old compiled form, if there is one
    if (_pcre) {
	pcre_free(_pcre);
    }

    if (_pcre_opts & PREGEX::_PREGEX_DEBUG_MSGS) {
	cout << "PREGEX compiling '" << _pattern << "'...\n";
    }

    const char *why;	// error text, filled in by pcre_compile on failure
    int where;		// offset in the pattern where the error was found

    // only the bits selected by _opt_mask are passed on to pcre
    _pcre = pcre_compile(_pattern.c_str(), _pcre_opts & PREGEX::_opt_mask, 
			 &why, &where, NULL);

    if (_pcre) {
	// existing study data belongs to the old pattern, so redo it
	if (_pcre_extra || (_pcre_opts & PREGEX::_PREGEX_OPTIMIZE)) {
	    study();
	}
	return;
    }

    cerr << "compilation of pRegex(" << _pattern << ") failed at: " 
	 << where << " because " << why << "\n";

    // should really throw exception or something
    abort();
}

/*
 * send the compiled pattern to the pcre optimizer
 * (untested)
 *
 * any existing study data is released first.  pcre_study may legitimately
 * return NULL when it finds nothing worth recording; that is only treated
 * as a failure when it also reports an error string, in which case the
 * message goes to cerr and the program abort()s.
 */
void
pRegex::study()
{
    if (_pcre_extra)
	pcre_free(_pcre_extra);

    if (_pcre_opts & PREGEX::_PREGEX_DEBUG_MSGS)
	cout << "PREGEX studying '" << _pattern << "'...\n";

    const char *errstr = 0;

    // pcre_study takes its own set of option bits, unrelated to the
    // pcre_compile bits kept in _pcre_opts.  passing the compile bits here
    // makes older pcre reject the call and newer pcre read them as study
    // options, so always pass 0.
    _pcre_extra = pcre_study(_pcre, 0, &errstr);

    if (!_pcre_extra && errstr) {
	cerr << "study of pRegex(" << _pattern 
	     << ") failed: " << errstr << "\n";

	// should really throw exception or something
	abort();
    }
}

/*
 * match s against the pattern.
 *
 * mods is a perl-style modifier string (see parse_mods); the pattern is
 * recompiled when it has not been compiled yet or the options changed.
 *
 * returns 0 when there is no match, otherwise the value pcre_exec
 * returned: the number of matched sub patterns *including* the pair for
 * the whole match.  any other pcre_exec result is reported on cerr and
 * the program abort()s.
 */
int
pRegex::mn(const string &s, const char *mods = "")
{
    const unsigned wanted = parse_mods(mods);

    if (!_pcre || wanted != _pcre_opts) {
	_pcre_opts = wanted;
	compile();
    }

    // a non-global match always starts at the beginning; a global match
    // restarts from the beginning only when the subject string changed.
    // NOTE(review): nothing visible in this file ever moves _m_offset
    // forward after a match -- confirm whether 'g' is meant to step
    // through successive matches.
    if (!(_pcre_opts & PREGEX::_PREGEX_GLOBAL)) {
	_m_offset = 0;
    } else if (s != _last_m_str) {
	_m_offset = 0;
	_last_m_str = s;
    }

    // scratch offset vector; its contents are not used after the call
    int ovec[300];

    const int n = pcre_exec(_pcre, _pcre_extra, s.c_str(), s.length(),
			    _m_offset, _pcre_opts & PREGEX::_opt_mask,
			    ovec, 300);

    if (n == PCRE_ERROR_NOMATCH)
	return 0;

    if (n <= 0) {
	cerr << "pcre_exec error = " << n << "\n";
	abort();
    }

    // this *INCLUDES* the first match pair, which is super sub-patterns
    return n;
}

/*
 * returns # of subs made
 *
 * this fxn copied almost verbatim from Max Okumoto's code.
 */
int
pRegex::do_subs(const string s, string & final, const char *replace_pat, 
		int *ovec, int nmatch)
{
    int nsubs = 0;
    const char *p = replace_pat;
    int subnum = 0;
    int state = 0;

    while (state != -1) {
	switch (state) {
	case 0:
	    if (!*p) {
		state = -1;
		break;
	    }
	    if (*p == '$') {
		state = 1;
		subnum = 0;
		if (p[1] == '&') {
		    p++;
		    if (isdigit(p[1]))
			p++;
		} else if (!isdigit(p[1])) {
		    cerr << "badly formed replacement pattern: " 
			 << replace_pat << "\n";
		    abort();  // woo hoo!  error recovery!  crash into mars!
		}
	    } else 
		final += *p;
	    break;
	case 1:
	    if (isdigit(*p)) {
		subnum *= 10;
		subnum += (*p) - '0';
	    } else {
		char substr[4096];
		int status;

		if (_pcre_opts & PREGEX::_PREGEX_DEBUG_MSGS)
		    cout << "PREGEX appending substr #" << subnum << "...\n";

		status = pcre_copy_substring(s.c_str(),
					     ovec, nmatch,
					     subnum, substr, 4096);

		switch (status) {
		case PCRE_ERROR_NOMEMORY:	/* buffer too small */
		case PCRE_ERROR_NOSUBSTRING:	/* bad stringnumber */
		    assert(0);
		}
		final += substr;
		nsubs++;

		state = 0;
		continue;	/* send char to start state */
	    }
	}
	++p;
    }
    return nsubs;
}

/*
 * substitute
 *
 * replaces the text in s matched by the pattern with the expansion of
 * replace_pat (see do_subs for the "$N" / "$&" syntax).  mods is a
 * perl-style modifier string (see parse_mods); with 'g' every match is
 * replaced, otherwise only the first.  s is overwritten with the result.
 *
 * returns the number of substitutions made, i.e. the number of matches
 * that were replaced (0 when the pattern did not match).  this count does
 * not depend on how many "$N" references replace_pat contains, so a plain
 * text replacement is counted too.
 *
 * a pcre_exec failure other than "no match" is reported on cerr and the
 * program abort()s.
 */
int
pRegex::s(string &s, const char *replace_pat, const char *mods)
{
    unsigned opts = parse_mods(mods);
    int nsubs = 0;
    string final("");

    if (!_pcre || opts != _pcre_opts) {
	_pcre_opts = opts;
	compile();
    }

    // offset vector for pcre_exec; fixed size, so no heap allocation needed
    int ovec[300];
    const int ovec_len = sizeof(ovec) / sizeof(ovec[0]);

    int offset = 0;	// where the next search starts
    int last = 0;	// end of the subject text already accounted for
    for (;;) {

	// find next matching subs
	const int nmatch = pcre_exec(_pcre, _pcre_extra, s.c_str(),
				     s.length(), offset,
				     _pcre_opts & PREGEX::_opt_mask,
				     ovec, ovec_len);

	// doesn't work exactly like perl
	if (nmatch == PCRE_ERROR_NOMATCH)
	    break;
	if (nmatch <= 0) {
	    cerr << "pcre_exec error = " << nmatch << "\n";
	    abort();
	}

	// append anything previously unmatched, but not substituted
	if (last <= ovec[0]) {
	    final += s.substr(last, ovec[0] - last);
	    last = ovec[1];
	}

	// replace stuff in s; one replaced match is one substitution
	do_subs(s, final, replace_pat, ovec, nmatch);
	++nsubs;

	// if global gotta check match at every pos
	if (!(_pcre_opts & PREGEX::_PREGEX_GLOBAL))
	    break;

	if (ovec[0] != ovec[1]) {
	    offset = ovec[1];
	} else {
	    // matched empty string: step one position forward so the same
	    // empty match is not found again, and stop at the end of s
	    if (ovec[1] == static_cast<int>(s.length()))
		break;
	    offset = ovec[1] + 1;
	}
    }

    // whatever follows the last match is kept as is
    final += s.substr(last);
    s = final;
    return nsubs;
}

/*
 * stream output: writes the pattern text surrounded by double quotes
 * and returns the stream.
 */
std::ostream& operator<< (std::ostream& os, const pRegex& p)
{
    return os << "\"" << p._pattern << "\"";
}

#ifdef TEST_PREGEX

/*
 * check that matching str against pat gives should_match.
 * prints one line describing the case, prefixed with "M FAILED: " when
 * the result is not the expected one.  returns true when the case passed.
 */
bool
test_m(const char *str, const char *pat, bool should_match)
{
    const bool passed = (pRegex(pat).m(str) == should_match);

    if (!passed)
	cout << "M FAILED: ";

    cout << should_match << " = (" << str << " ~= " << pat << ")\n";

    return passed;
}

bool
test_s(const char *str, const char *pat, const char *repl, const char *mods,
       const char *res)
{
    int n;
    bool ret;
    string tmp(str);

    n = pRegex(pat).s(tmp, repl, mods);

    if (res != tmp) {
	cout << "S FAILED: ";
	ret = 0;
    } else
	ret = 1;

    cout << n << " = (" << str << " ~= m/" << pat << "/" << repl << "/" 
	 << mods << ") -> " << tmp << "\n";

    return ret;
}

int
main()
{
    /*** from header ***/
    string t, s = "man on the roof";
    if (pRegex("on the").m(s))
        cout << "matched\n";
    // prints "matched"

    t = "see with a telescope";
    if (pRegex("a (telescope|sled)").s(t, "the $1"))
        cout << "new str:\n";
    cout << t << "\n";
    t = "see with a sled";
    if (pRegex("a (telescope|sled)").s(t, "the $1"))
        cout << "new str:\n";
    cout << t << "\n";
    /* *************** */

    /*
     * m() is pretty lame brained, probably wont fail. ('less pcre does.)
     */
    test_m("mojo", "mojo", 1);
    test_m("mojo", "mofo", 0);
    test_m("mojo", "oj", 1);
    test_m("mojo", "m..o", 1);
    test_m("mojo", "m.*o", 1);
    test_m("mao-po tofu", "\\bpo\\b", 1);
    test_m("mao-po tofu", "\\ba-po\\b", 0);

    /*
     * s() is pretty lame brained, it probably has bugs
     */
    test_s("mojo", "oj", "of", "", "mofo");
    test_s("mojo", "m(.*)o", "$1mo", "", "ojmo");
    test_s("mooo", "m(.*)o", "$1", "", "oo");
    test_s("mooo", "o", "p", "", "mpoo");

    /* 
     * global substitutions 
     */
    test_s("mooooooooooooooooo", "o", "p", "g", "mppppppppppppppppp");
    test_s("mooo", "oo", "p", "g", "mpo");
    test_s("mooo", "\\b", "p", "g", "pmooop");
    test_s("mooo", "\\B", "p", "g", "mpopopo");
    test_s("mooo", "", "p", "g", "pmpopopop");
    test_s("mooo", "o", "*$&*", "g", "m*o**o**o*");
    test_s("mooo", "(o)", "*$1$&*", "g", "m*oo**oo**oo*");
    test_s("cat and hat", "(\\w*) and (\\w*)", "$2 or $1", "g", "hat or cat");
//    test_s("mooo", "(o)", "*$2$&*", "g", "m*oo**oo**oo*"); bad str number

    /*
     * 
     */
    test_s("atfata", "a([^a]*)a", "b$1b", "g", "btfbta");

    test_s("atfata", "a([^a]*)a", "b$1b", "go", "btfbta");
}

#endif
