#!/usr/bin/python
r"""Usage: pyregex.py [options] "-"|filename regex [replacement [count]]

Test Python regular expressions. Specify test data's filename or use "-"
to enter test text from console. Optionally specify a replacement text.

Options:
-f      filter mode
-n nnn  limit to examine the first nnn lines. default no limit.
-m      show only matched lines. default False


Regular Expression Syntax

Special Characters
------------------------------------------------------------------------
.       matches any character except a newline
^       matches the start of the string
$       matches the end of the string or just before the newline at the end of
        the string
*       matches 0 or more repetitions of the preceding RE
+       matches 1 or more repetitions of the preceding RE
?       matches 0 or 1 repetitions of the preceding RE
{m}     exactly m copies of the previous RE should be matched
{m,n}   matches from m to n repetitions of the preceding RE
\       either escapes special characters or signals a special sequence
[]      indicate a set of characters. Characters can be listed individually,
        or a range of characters can be indicated by giving two characters and
        separating them by a "-". Special characters are not active inside sets
        Including a "^" as the first character matches the complement of the set
|       A|B matches either A or B
(...)   indicates the start and end of a group
(?...)  this is an extension notation. See documentation for detail
(?iLmsux) I ignorecase; L locale; M multiline; S dotall; U unicode; X verbose

*, +, ? and {m,n} are greedy. Append the ? qualifier to match non-greedily.


Special Sequences
------------------------------------------------------------------------
\number matches the contents of the group of the same number. Groups are
        numbered starting from 1
\A      matches only at the start of the string
\b      matches the empty string at the beginning or end of a word
\B      matches the empty string not at the beginning or end of a word
\d      matches any decimal digit
\D      matches any non-digit character
\g<name> use the substring matched by the group named 'name' for sub()
\s      matches any whitespace character
\S      matches any non-whitespace character
\w      matches any alphanumeric character and the underscore
\W      matches any non-alphanumeric character
\Z      matches only at the end of the string


See the Python documentation on Regular Expression Syntax for more detail

http://docs.python.org/lib/re-syntax.html
"""

# Module metadata: author, version, home page and license.
__author__ = "Wai Yip Tung"
__version__ = "0.5"
__url__ = "http://tungwaiyip.info/software/pyregex.html"
__license__ = "Public Domain"

#2006-03-08 support unicode?
#2006-03-08 no multiline support?

import re
import sys

# Select the console coloring option used by writeColor():
#   USE_WIN32CONSOLE - change text attributes through the win32console API
#   USE_ANSI         - emit ANSI escape sequences
#   neither          - plain ASCII fallback
USE_ANSI = True
USE_WIN32CONSOLE = False

if 'win32' in sys.platform.lower():
    try:
        import win32console
        import pywintypes
    except ImportError:
        # The win32 extension modules are unavailable; keep the ANSI default.
        pass
    else:
        USE_ANSI = False
        win32_stdout = win32console.GetStdHandle(win32console.STD_OUTPUT_HANDLE)
        try:
            # Remember the current attributes so writeColor() can restore them.
            win32_orig_attr = win32_stdout.GetConsoleScreenBufferInfo()['Attributes']
            USE_WIN32CONSOLE = True
        except pywintypes.error:
            # output redirected?
            pass


def writeColor(s):
    """ write s to stdout, highlighted with the selected console option """
    emit = sys.stdout.write

    if USE_WIN32CONSOLE:
        # Windows console: switch the text attribute, write, then restore
        # the attribute that was saved at start-up.
        # NOTE(review): 31 (0x1F) is presumably bright white on blue, to
        # match the ANSI sequence used below - confirm.
        win32_stdout.SetConsoleTextAttribute(31)
        emit(s)
        win32_stdout.SetConsoleTextAttribute(win32_orig_attr)
        return

    if USE_ANSI:
        opening, closing = '\x1b[1;44m', '\x1b[0m'
    else:
        # ASCII mode: no color available, so bracket the text instead
        opening, closing = '[', ']'

    emit(opening)
    emit(s)
    emit(closing)


def open_text(p):
    """
    generator to return the lines of text to be scanned

    p.filename selects the source: "-" reads lines from the console until
    EOF (a prompt is printed first unless p.filter_mode is set); any other
    value is opened as a text file.

    note each line is stripped of its line terminator only; other trailing
    whitespace is kept so that a pattern can match it
    """
    if p.filename == '-':
        if not p.filter_mode:
            sys.stdout.write('Enter the text below. End with EOF.\n')
        # raw_input() is named input() on Python 3
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        #get all input first
        lines = []
        while True:
            try:
                lines.append(read_line(''))
            except EOFError:
                break
        for line in lines:
            yield line
    else:
        # The with-block closes the file when the generator finishes or is
        # closed, e.g. after the consumer stops early at a line limit.
        with open(p.filename) as fp:
            for line in fp:
                yield line.rstrip('\r\n')


def _find_matches(r, line, p):
    """Return a list of (match obj, display text) pairs for one line.

    With a replacement in p.repl the display text is the expanded
    replacement, limited to p.rcount substitutions (0 means no limit);
    otherwise it is the matched text itself.
    """
    if not p.repl:
        return [(m, m.group()) for m in r.finditer(line)]

    matches = []

    def substitute(m):
        text = m.expand(p.repl)
        matches.append((m, text))
        return text

    # sub() is called only to drive the callback; its result is not needed
    r.sub(substitute, line, p.rcount)
    return matches


def scan(fp, r, p):
    """Scan lines for the pattern and write the result to stdout.

    fp -- iterable of lines
    r  -- compiled regular expression
    p  -- parameters; filter_mode, number_of_lines, match_only, repl and
          rcount are read

    Each line is written with every match replaced by its display text
    (the match itself, or its replacement). Outside filter mode the lines
    are numbered, matches are highlighted with writeColor(), and the
    groups of the first match plus a match count are shown at the end.
    """
    write = sys.stdout.write
    if not p.filter_mode:
        write('\n')
    count = 0
    first_match = None
    for i, line in enumerate(fp):

        if p.number_of_lines and i >= p.number_of_lines:
            break

        # find/replace pattern
        matches = _find_matches(r, line, p)

        if p.match_only and not matches:
            continue

        # show result line
        cp = 0  # position in line just past the previous match
        if not p.filter_mode:
            write('%2d: ' % (i+1))

        for m, text in matches:
            count += 1
            if first_match is None:
                first_match = m
            if m.start() > cp:
                write(line[cp:m.start()])
            if not p.filter_mode:
                writeColor(text)
            else:
                write(text)
            cp = m.end()
        if cp < len(line):
            write(line[cp:])
        write('\n')

    # filter mode writes the result lines only
    if p.filter_mode:
        return

    # show groups (first_match), only when it has a matched group
    if first_match is not None and first_match.lastindex:
        write('\nGroups:\n')
        write('\\0: "%s"\n' % first_match.group(0))
        for i, g in enumerate(first_match.groups()):
            write('\\%s: "%s"\n' % (i+1, g))
        for n, v in first_match.groupdict().items():
            write('%s: "%s"\n' % (n, v))

    # show final stat
    if count:
        write('\nNumber of matches: %s\n\n' % count)
    else:
        write('\nNo match\n\n')

def main(p):
    """Compile p.regex and scan the text selected by p.

    An invalid regular expression is reported on stderr and the program
    exits with status -1.
    """
    try:
        r = re.compile(p.regex)
    except re.error as e:
        sys.stderr.write('%s: "%s"\n' % (e, p.regex))
        sys.exit(-1)
    fp = open_text(p)
    scan(fp, r, p)


class Parameters(object):
    """Settings collected from the command line and passed to main()."""

    def __init__(self):
        # text source; '-' means read from the console
        self.filename = '-'
        # -f: write the result lines only (no prompt, numbering,
        # highlighting or summary)
        self.filter_mode = False
        # -n: number of lines to examine; 0 means no limit
        self.number_of_lines = 0
        # -m: show only lines that contain a match
        self.match_only = False
        # regular expression source
        self.regex = ''
        # replacement text; '' means search only
        self.repl = ''
        # replacement count passed to sub() for each line
        self.rcount = 0

    def __repr__(self):
        fields = ', '.join('%s=%r' % item for item in sorted(vars(self).items()))
        return '%s(%s)' % (type(self).__name__, fields)


def print_usage():
    """Write the module docstring (the usage text) to stdout and exit with status -1."""
    sys.stdout.write('%s\n' % (__doc__,))
    sys.exit(-1)


if __name__ == '__main__':
    args = sys.argv[1:]
    p = Parameters()

    # Consume the leading option switches; the first argument that is not
    # a known switch ends option parsing.
    while args and args[0] in ('-f', '-m', '-n'):
        switch = args.pop(0)
        if switch == '-f':
            p.filter_mode = True
        elif switch == '-m':
            p.match_only = True
        else:
            # -n must be followed by an all-digit line count
            if not (args and args[0].isdigit()):
                print_usage()
            p.number_of_lines = int(args.pop(0))

    # filename and regex are mandatory
    if len(args) < 2:
        print_usage()
    p.filename = args.pop(0)
    p.regex = args.pop(0)

    # optional replacement, optionally followed by an all-digit count
    if args:
        p.repl = args.pop(0)
        if args and args[0].isdigit():
            p.rcount = int(args.pop(0))

    main(p)

