// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Scripting;
using Microsoft.Scripting.Runtime;
using Microsoft.Scripting.Utils;
using IronPython.Runtime;
using IronPython.Runtime.Exceptions;
using IronPython.Runtime.Operations;
using IronPython.Runtime.Types;
[assembly: PythonModule("re", typeof(IronPython.Modules.PythonRegex))]
namespace IronPython.Modules {
/// <summary>
/// Python regular expression module.
/// </summary>
public static class PythonRegex {
/// <summary>
/// Cache of <see cref="RE_Pattern"/> instances built by GetPattern, keyed by
/// pattern text and flags. Constructed with the argument 100 (presumably the
/// maximum number of cached entries - confirm against CacheDict) and replaced
/// wholesale by <see cref="purge"/>.
/// </summary>
private static CacheDict<PatternKey, RE_Pattern> _cachedPatterns = new CacheDict<PatternKey, RE_Pattern>(100);
/// <summary>
/// Module (re)load hook. Registers the module exception under the state key
/// "reerror" (exposed as "error" in module "re") and stores the module's
/// "_pickle" entry in the copyreg dispatch table for <see cref="RE_Pattern"/>.
/// </summary>
[SpecialName]
public static void PerformModuleReload(PythonContext/*!*/ context, PythonDictionary/*!*/ dict) {
    context.EnsureModuleException("reerror", dict, "error", "re");

    var dispatchTable = PythonCopyReg.GetDispatchTable(context.SharedContext);
    var patternType = DynamicHelpers.GetPythonTypeFromType(typeof(RE_Pattern));
    dispatchTable[patternType] = dict["_pickle"];
}
// Random number source used by GetRandomString to build the suffix of mangled
// group names; seeded from the millisecond component of the current time.
private static readonly Random r = new Random(DateTime.Now.Millisecond);
#region CONSTANTS
// short forms
// Each short form has the same bit value as the long form declared below.
public const int I = 0x02; // same value as IGNORECASE
public const int L = 0x04; // same value as LOCALE
public const int M = 0x08; // same value as MULTILINE
public const int S = 0x10; // same value as DOTALL
public const int U = 0x20; // same value as UNICODE
public const int X = 0x40; // same value as VERBOSE
public const int A = 0x100; // same value as ASCII
// long forms
// FlagsToOption translates IGNORECASE, LOCALE, MULTILINE, DOTALL and VERBOSE
// into RegexOptions; UNICODE and ASCII are not consulted there.
public const int IGNORECASE = 0x02;
public const int LOCALE = 0x04;
public const int MULTILINE = 0x08;
public const int DOTALL = 0x10;
public const int UNICODE = 0x20;
public const int VERBOSE = 0x40;
public const int ASCII = 0x100;
#endregion
#region Public API Surface
/// <summary>
/// Returns the pattern object for <paramref name="pattern"/>, requesting a
/// compiled regex. An <see cref="ArgumentException"/> raised while building
/// the pattern is rethrown as the module's error exception with the same message.
/// </summary>
public static RE_Pattern compile(CodeContext/*!*/ context, object pattern, int flags = 0) {
    RE_Pattern compiledPattern;
    try {
        compiledPattern = GetPattern(context, pattern, flags, true);
    } catch (ArgumentException ex) {
        throw PythonExceptions.CreateThrowable(error(context), ex.Message);
    }
    return compiledPattern;
}
// Public constant string exposed as the module attribute "engine".
public const string engine = "cli reg ex";
/// <summary>
/// Returns <paramref name="text"/> with a backslash inserted before every
/// character for which <see cref="char.IsLetterOrDigit(char)"/> is false.
/// The original string instance is returned when nothing needs escaping.
/// </summary>
/// <exception>TypeError when <paramref name="text"/> is null.</exception>
public static string escape(string text) {
    if (text == null) throw PythonOps.TypeError("text must not be None");

    // Locate the first character that needs escaping.
    int first = 0;
    while (first < text.Length && char.IsLetterOrDigit(text[first])) {
        first++;
    }
    if (first == text.Length) {
        return text;
    }

    // Copy the untouched prefix, then escape character by character.
    StringBuilder escaped = new StringBuilder(text, 0, first, text.Length);
    for (int pos = first; pos < text.Length; pos++) {
        char current = text[pos];
        if (!char.IsLetterOrDigit(current)) {
            escaped.Append('\\');
        }
        escaped.Append(current);
    }
    return escaped.ToString();
}
/// <summary>
/// Finds all matches of <paramref name="pattern"/> in a string and returns
/// them in the shape produced by FixFindAllMatch (plain values, or tuples when
/// the pattern has several groups).
/// </summary>
public static PythonList findall(CodeContext/*!*/ context, object pattern, string @string, int flags = 0) {
    object validated = ValidatePattern(pattern);
    RE_Pattern compiled = GetPattern(context, validated, flags);
    // Throws a TypeError for a null string; the returned value is not needed here.
    ValidateString(@string, nameof(@string));
    MatchCollection found = compiled.FindAllWorker(context, @string, 0, @string.Length);
    return FixFindAllMatch(compiled, found, null);
}
/// <summary>
/// Overload of findall taking a list-like input. The input is validated by
/// ValidateString, searched over its whole length, and the results are passed
/// through the converter chosen by FindMaker for the input.
/// </summary>
public static PythonList findall(CodeContext context, object pattern, IList @string, int flags = 0) {
    object validated = ValidatePattern(pattern);
    RE_Pattern compiled = GetPattern(context, validated, flags);
    ValidateString(@string, nameof(@string));
    MatchCollection found = compiled.FindAllWorker(context, @string, 0, @string.Count);
    return FixFindAllMatch(compiled, found, FindMaker(@string));
}
/// <summary>
/// Chooses the converter applied to each matched string before it is returned
/// from findall.
/// </summary>
/// <param name="input">The object that was searched.</param>
/// <returns>
/// A delegate wrapping the matched text in a new ByteArray when
/// <paramref name="input"/> is a ByteArray; otherwise null, meaning the matched
/// strings are returned unchanged.
/// </returns>
private static Func<string, object> FindMaker(object input) {
    if (input is ByteArray) {
        return text => new ByteArray(text.MakeByteArray());
    }
    return null;
}
/// <summary>
/// Converts a .NET <see cref="MatchCollection"/> into the list shape returned by findall.
/// </summary>
/// <param name="pat">Pattern that produced the matches; its group count selects the result shape.</param>
/// <param name="mc">The matches to convert.</param>
/// <param name="maker">Optional converter applied to every returned string; null returns the string itself.</param>
/// <returns>
/// One list entry per match: a tuple of all group values when the pattern has
/// two or more groups, the single group's value when it has exactly one, and
/// the whole match otherwise.
/// </returns>
private static PythonList FixFindAllMatch(RE_Pattern pat, MatchCollection mc, Func<string, object> maker) {
    object[] matches = new object[mc.Count];
    // CLR gives us a "bonus" group of 0 - the entire expression
    int numgrps = pat._re.GetGroupNumbers().Length;

    for (int i = 0; i < matches.Length; i++) {
        Match match = mc[i];
        if (numgrps > 2) {
            // More than one group in the pattern: return a tuple per match, e.g.
            // findall("(\d+)|(\w+)", "x = 99y") == [('', 'x'), ('99', ''), ('', 'y')]
            // where '' marks a group that did not take part in the match.
            GroupCollection groups = match.Groups;
            var tpl = new List<object>(groups.Count);
            // Index 0 is the whole match, which is not part of the tuple.
            for (int k = 1; k < groups.Count; k++) {
                string value = groups[k].Value;
                tpl.Add(maker != null ? maker(value) : value);
            }
            matches[i] = PythonTuple.Make(tpl);
        } else if (numgrps == 2) {
            // Exactly one group: return the group's text rather than the whole match,
            // e.g. re.findall(r"(\w+)\s+fish\b", "green fish") yields "green".
            string value = match.Groups[1].Value;
            matches[i] = maker != null ? maker(value) : value;
        } else {
            string value = match.Value;
            matches[i] = maker != null ? maker(value) : value;
        }
    }

    return PythonList.FromArrayNoCopy(matches);
}
/// <summary>
/// Returns an iterator over the match objects for every match of
/// <paramref name="pattern"/> in <paramref name="string"/>.
/// </summary>
public static object finditer(CodeContext/*!*/ context, object pattern, object @string, int flags = 0) {
    object validated = ValidatePattern(pattern);
    RE_Pattern compiled = GetPattern(context, validated, flags);
    string input = ValidateString(@string, nameof(@string));
    MatchCollection found = compiled.FindAllWorker(context, input, 0, input.Length);
    return MatchIterator(found, compiled, input);
}
/// <summary>
/// Module-level match: resolves the pattern and delegates to
/// <see cref="RE_Pattern.match(object)"/>.
/// </summary>
public static RE_Match match(CodeContext/*!*/ context, object pattern, object @string, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.match(input);
}
/// <summary>
/// Module-level fullmatch: resolves the pattern and delegates to the pattern's
/// fullmatch with the default start position.
/// </summary>
public static RE_Match fullmatch(CodeContext/*!*/ context, object pattern, object @string, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.fullmatch(context, input);
}
/// <summary>
/// Module-level search: resolves the pattern and delegates to
/// <see cref="RE_Pattern.search(object)"/>.
/// </summary>
public static RE_Match search(CodeContext/*!*/ context, object pattern, object @string, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.search(input);
}
/// <summary>
/// Module-level split: resolves the pattern and delegates to the pattern's
/// split with the given <paramref name="maxsplit"/>.
/// </summary>
[return: SequenceTypeInfo(typeof(string))]
public static PythonList split(CodeContext/*!*/ context, object pattern, object @string, int maxsplit = 0, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.split(input, maxsplit);
}
/// <summary>
/// Module-level sub: resolves the pattern and delegates to the pattern's sub.
/// </summary>
public static string sub(CodeContext/*!*/ context, object pattern, object repl, object @string, int count = 0, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.sub(context, repl, input, count);
}
/// <summary>
/// Module-level subn: resolves the pattern and delegates to the pattern's subn.
/// </summary>
public static object subn(CodeContext/*!*/ context, object pattern, object repl, object @string, int count = 0, int flags = 0) {
    RE_Pattern compiled = GetPattern(context, ValidatePattern(pattern), flags);
    string input = ValidateString(@string, nameof(@string));
    return compiled.subn(context, repl, input, count);
}
/// <summary>
/// Discards every cached pattern by replacing the cache with a fresh, empty one
/// constructed with the same argument (100) as the initial cache.
/// </summary>
public static void purge() {
    _cachedPatterns = new CacheDict<PatternKey, RE_Pattern>(100);
}
#endregion
#region Public classes
/// <summary>
/// Compiled reg-ex pattern
/// </summary>
[PythonType]
public class RE_Pattern : IWeakReferenceable {
// Regex built by the constructor from the pre-parsed pattern.
internal readonly Regex _re;
// Result of PreParseRegex for the pattern this object was created from.
internal readonly ParsedRegex _pre;
// Lazily created dictionary returned by the groupindex property.
private PythonDictionary _groups;
// Tracker stored and returned by the IWeakReferenceable implementation.
private WeakRefTracker _weakRefTracker;
/// <summary>
/// Builds the .NET <see cref="Regex"/> for an already pre-parsed pattern.
/// </summary>
/// <param name="context">Context used to look up the module's error type.</param>
/// <param name="pattern">Pre-parsed (.NET syntax) pattern text.</param>
/// <param name="flags">Python flag bits, translated with FlagsToOption.</param>
/// <param name="compiled">When true, RegexOptions.Compiled is added.</param>
/// <param name="fullmatch">When true, the pattern is wrapped so it must end at the end of the input.</param>
/// <returns>The constructed regex.</returns>
/// <exception>The module's error exception when the Regex constructor rejects the pattern.</exception>
private static Regex GenRegex(CodeContext/*!*/ context, string pattern, int flags, bool compiled, bool fullmatch) {
    try {
        RegexOptions opts = FlagsToOption(flags);
        if (compiled) {
            opts |= RegexOptions.Compiled;
        }

        if (fullmatch) {
            // In whitespace-ignoring mode a trailing "# comment" would swallow the
            // closing parenthesis; a newline (ignored in that mode) ends the comment.
            string commentTerminator = (opts & RegexOptions.IgnorePatternWhitespace) != 0 ? "\n" : "";
            // \z anchors at the very end of the input; \Z would also accept a
            // position just before a final newline.
            pattern = "(?:" + pattern + commentTerminator + ")\\z";
        }

        return new Regex(pattern, opts);
    } catch (ArgumentException e) {
        throw PythonExceptions.CreateThrowable(error(context), e.Message);
    }
}
/// <summary>
/// Creates a pattern object: pre-parses the pattern text, merges the options
/// discovered during pre-parsing into <paramref name="flags"/>, builds the
/// regex and records the merged flags.
/// </summary>
internal RE_Pattern(CodeContext/*!*/ context, object pattern, int flags = 0, bool compiled = false) {
    bool verbose = (flags & VERBOSE) != 0;
    string userPattern = ValidatePatternAsString(pattern);
    _pre = PreParseRegex(context, userPattern, verbose);

    int mergedFlags = flags | OptionToFlags(_pre.Options);
    _re = GenRegex(context, _pre.Pattern, mergedFlags, compiled, false);
    this.flags = mergedFlags;
}
/// <summary>
/// Matches the pattern at the start of <paramref name="text"/>; returns null
/// when the first match found does not begin at index 0.
/// </summary>
public RE_Match match(object text) {
    string input = ValidateString(text, nameof(text));
    Match found = _re.Match(input);
    return RE_Match.makeMatch(found, this, input, 0, input.Length);
}
/// <summary>
/// Clamps <paramref name="position"/> into the range [0, text.Length].
/// </summary>
private static int FixPosition(string text, int position) {
    return Math.Min(Math.Max(position, 0), text.Length);
}
/// <summary>
/// Matches the pattern at position <paramref name="pos"/> (clamped to the
/// input bounds); returns null when no match begins exactly there.
/// </summary>
public RE_Match match(object text, int pos) {
    string input = ValidateString(text, nameof(text));
    int start = FixPosition(input, pos);
    Match found = _re.Match(input, start);
    return RE_Match.makeMatch(found, this, input, start, input.Length);
}
/// <summary>
/// Matches the pattern at <paramref name="pos"/>, looking only at the input up
/// to <paramref name="endpos"/>. Both positions are clamped to the input bounds.
/// </summary>
/// <returns>
/// The match object, or null when nothing matches at <paramref name="pos"/> or
/// when <paramref name="endpos"/> is less than <paramref name="pos"/>.
/// </returns>
public RE_Match match(object text, [DefaultParameterValue(0)]int pos, int endpos) {
    string input = ValidateString(text, nameof(text));
    pos = FixPosition(input, pos);
    endpos = FixPosition(input, endpos);
    // Regex.Match throws when the start index lies beyond the (truncated)
    // input; an empty search window simply has no match.
    if (endpos < pos) {
        return null;
    }

    Match found = _re.Match(input.Substring(0, endpos), pos);
    return RE_Match.makeMatch(found, this, input, pos, endpos);
}
// Regex used by fullmatch; created on first use by GetRegexFullMatch.
private Regex _re_fullmatch;

/// <summary>
/// Returns the regex used by fullmatch, creating it on first use while
/// holding a lock on <c>_re</c>. It is generated from the same pre-parsed
/// pattern and flags, and is compiled only if <c>_re</c> is.
/// </summary>
private Regex GetRegexFullMatch(CodeContext /*!*/ context) {
    if (_re_fullmatch != null) {
        return _re_fullmatch;
    }

    lock (_re) {
        if (_re_fullmatch == null) {
            bool compiled = _re.Options.HasFlag(RegexOptions.Compiled);
            _re_fullmatch = GenRegex(context, _pre.Pattern, flags, compiled, true);
        }
    }
    return _re_fullmatch;
}
/// <summary>
/// Returns a match only when the pattern matches everything from
/// <paramref name="pos"/> (clamped to the input bounds) to the end of the input.
/// </summary>
public RE_Match fullmatch(CodeContext/*!*/ context, object text, int pos = 0) {
    string input = ValidateString(text, nameof(text));
    int start = FixPosition(input, pos);
    Match found = GetRegexFullMatch(context).Match(input, start);
    return RE_Match.makeFullMatch(found, this, input, start, input.Length);
}
/// <summary>
/// Returns a match only when the pattern matches the whole window from
/// <paramref name="pos"/> to <paramref name="endpos"/>. Both positions are
/// clamped to the input bounds.
/// </summary>
/// <returns>
/// The match object, or null when the window is not matched in full or when
/// <paramref name="endpos"/> is less than <paramref name="pos"/>.
/// </returns>
public RE_Match fullmatch(CodeContext/*!*/ context, object text, [DefaultParameterValue(0)]int pos, int endpos) {
    string input = ValidateString(text, nameof(text));
    pos = FixPosition(input, pos);
    endpos = FixPosition(input, endpos);
    // Regex.Match throws when the start index lies beyond the (truncated)
    // input; an empty search window simply has no match.
    if (endpos < pos) {
        return null;
    }

    Match found = GetRegexFullMatch(context).Match(input.Substring(0, endpos), pos);
    return RE_Match.makeFullMatch(found, this, input, pos, endpos);
}
/// <summary>
/// Searches <paramref name="text"/> for the first match of the pattern;
/// returns null when there is none.
/// </summary>
public RE_Match search(object text) {
    string input = ValidateString(text, nameof(text));
    Match found = _re.Match(input);
    return RE_Match.make(found, this, input);
}
/// <summary>
/// Searches <paramref name="text"/> for the first match starting at or after
/// <paramref name="pos"/>.
/// </summary>
/// <param name="text">The input to search.</param>
/// <param name="pos">Start index; clamped to the input bounds so an index past the end searches from the end rather than throwing.</param>
/// <returns>The match object (reporting the clamped start position), or null when there is no match.</returns>
public RE_Match search(object text, int pos) {
    string input = ValidateString(text, nameof(text));
    // Regex.Match throws ArgumentOutOfRangeException for a start index
    // outside [0, input.Length].
    pos = FixPosition(input, pos);
    return RE_Match.make(_re.Match(input, pos), this, input, pos, input.Length);
}
/// <summary>
/// Searches the window [<paramref name="pos"/>, <paramref name="endpos"/>) of
/// <paramref name="text"/> for the first match. Both positions are clamped to
/// the input bounds.
/// </summary>
/// <returns>
/// The match object, or null when there is no match or when
/// <paramref name="endpos"/> is less than <paramref name="pos"/>. The match
/// reports the complete input as its string together with the clamped positions.
/// </returns>
public RE_Match search(object text, int pos, int endpos) {
    string input = ValidateString(text, nameof(text));
    pos = FixPosition(input, pos);
    endpos = FixPosition(input, endpos);
    if (endpos < pos) {
        return null;
    }

    // Only the searched text is truncated; the match object keeps the full input.
    string window = endpos < input.Length ? input.Substring(0, endpos) : input;
    return RE_Match.make(_re.Match(window, pos), this, input, pos, endpos);
}
/// <summary>
/// Finds all matches in <paramref name="string"/> from <paramref name="pos"/>
/// up to the optional <paramref name="endpos"/> and returns them in the shape
/// produced by FixFindAllMatch, converted by the maker chosen for the input.
/// </summary>
public object findall(CodeContext/*!*/ context, object @string, int pos = 0, object endpos = null) {
    string input = ValidateString(@string, nameof(@string));
    MatchCollection found = FindAllWorker(context, input, pos, endpos);
    return FixFindAllMatch(this, found, FindMaker(@string));
}
/// <summary>
/// Runs the regex over <paramref name="str"/> and returns all matches.
/// </summary>
/// <param name="context">Context used to convert <paramref name="endpos"/> to an integer.</param>
/// <param name="str">The input to search.</param>
/// <param name="pos">Start index; clamped to the bounds of the searched text.</param>
/// <param name="endpos">Optional end index (null means the whole input); clamped to the input bounds.</param>
/// <returns>The collection of matches found in the (possibly truncated) input.</returns>
internal MatchCollection FindAllWorker(CodeContext/*!*/ context, string str, int pos, object endpos) {
    string against = str;
    if (endpos != null) {
        int end = context.LanguageContext.ConvertToInt32(endpos);
        // Substring throws when the length exceeds the string, so clamp both ways.
        end = Math.Min(Math.Max(end, 0), against.Length);
        if (end < against.Length) {
            against = against.Substring(0, end);
        }
    }

    // Regex.Matches throws for a start index outside [0, against.Length].
    // NOTE(review): a pos beyond endpos is clamped to the end of the window, so a
    // pattern that can match the empty string still yields one empty match there.
    pos = Math.Min(Math.Max(pos, 0), against.Length);
    return _re.Matches(against, pos);
}
/// <summary>
/// List-input overload: converts the input with MakeString and forwards to the
/// string overload.
/// </summary>
internal MatchCollection FindAllWorker(CodeContext/*!*/ context, IList str, int pos, object endpos) {
    string text = str.MakeString();
    return FindAllWorker(context, text, pos, endpos);
}
/// <summary>
/// Returns an iterator over match objects for every match found from
/// <paramref name="pos"/> to the end of the input.
/// </summary>
public object finditer(CodeContext/*!*/ context, object @string, int pos=0) {
    string input = ValidateString(@string, nameof(@string));
    MatchCollection found = FindAllWorker(context, input, pos, null);
    return MatchIterator(found, this, input);
}
/// <summary>
/// Returns an iterator over match objects for every match found between
/// <paramref name="pos"/> and <paramref name="endpos"/>.
/// </summary>
public object finditer(CodeContext/*!*/ context, object @string, int pos, int endpos) {
    string input = ValidateString(@string, nameof(@string));
    MatchCollection found = FindAllWorker(context, input, pos, endpos);
    return MatchIterator(found, this, input);
}
/// <summary>
/// Splits the input at every non-empty match of the pattern.
/// </summary>
/// <param name="string">The input to split.</param>
/// <param name="maxsplit">
/// Maximum number of splits; 0 means no limit and a negative value returns
/// the input unsplit.
/// </param>
/// <returns>
/// The pieces between matches. After each split, the value of every group of
/// the match is inserted (null for a group that did not take part).
/// </returns>
[return: SequenceTypeInfo(typeof(string))]
public PythonList split(object @string, int maxsplit = 0) {
    string source = ValidateString(@string, nameof(@string));
    PythonList pieces = new PythonList();

    // negative maxsplit == "make no splits"
    if (maxsplit < 0) {
        pieces.AddNoLock(source);
        return pieces;
    }

    int tailStart = 0;   // start of the text not yet emitted
    int splitsDone = 0;
    foreach (Match m in _re.Matches(source)) {
        // empty matches never split
        if (m.Length <= 0) {
            continue;
        }

        pieces.AddNoLock(source.Substring(tailStart, m.Index - tailStart));
        for (int g = 1; g < m.Groups.Count; g++) {
            Group grp = m.Groups[g];
            pieces.AddNoLock(grp.Success ? grp.Value : null);
        }

        tailStart = m.Index + m.Length;
        splitsDone++;
        if (splitsDone == maxsplit) {
            break;
        }
    }

    // whatever follows the last split
    pieces.AddNoLock(source.Substring(tailStart));
    return pieces;
}
/// <summary>
/// Replaces matches of the pattern in the input.
/// </summary>
/// <param name="context">Context used when <paramref name="repl"/> has to be called.</param>
/// <param name="repl">
/// A string, ExtensibleString or Bytes template (expanded with UnescapeGroups),
/// or any other object, which is called with the match object.
/// </param>
/// <param name="string">The input to process.</param>
/// <param name="count">Maximum number of replacements; 0 means all.</param>
/// <returns>The input with the replacements applied.</returns>
/// <exception>TypeError when <paramref name="repl"/> is null.</exception>
public string sub(CodeContext/*!*/ context, object repl, object @string, int count = 0) {
    if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");

    int limit = count == 0 ? int.MaxValue : count;

    // Resolve a textual template; it stays null when repl must be called.
    string template;
    if (repl is string text) {
        template = text;
    } else if (repl is ExtensibleString extensible) {
        template = extensible.Value;
    } else if (repl is Bytes bytes) {
        template = bytes.MakeString();
    } else {
        template = null;
    }

    string input = ValidateString(@string, nameof(@string));
    Match previous = null;
    MatchEvaluator evaluator = current => {
        // from the docs: Empty matches for the pattern are replaced
        // only when not adjacent to a previous match
        bool adjacentEmpty = string.IsNullOrEmpty(current.Value)
            && previous != null
            && previous.Index + previous.Length == current.Index;
        if (adjacentEmpty) {
            return "";
        }

        previous = current;
        if (template != null) {
            return UnescapeGroups(current, template);
        }
        return PythonCalls.Call(context, repl, RE_Match.make(current, this, input)) as string;
    };

    return _re.Replace(input, evaluator, limit);
}
/// <summary>
/// Like sub, but also reports how many replacements were made.
/// </summary>
/// <param name="context">Context used when <paramref name="repl"/> has to be called.</param>
/// <param name="repl">
/// A string, ExtensibleString or Bytes template (expanded with UnescapeGroups),
/// or any other object, which is called with the match object.
/// </param>
/// <param name="string">The input to process.</param>
/// <param name="count">Maximum number of replacements; 0 means all.</param>
/// <returns>A tuple of the resulting string and the number of replacements.</returns>
/// <exception>TypeError when <paramref name="repl"/> is null.</exception>
public object subn(CodeContext/*!*/ context, object repl, object @string, int count = 0) {
    if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");

    int limit = count == 0 ? int.MaxValue : count;

    // Resolve a textual template; it stays null when repl must be called.
    string template;
    if (repl is string text) {
        template = text;
    } else if (repl is ExtensibleString extensible) {
        template = extensible.Value;
    } else if (repl is Bytes bytes) {
        template = bytes.MakeString();
    } else {
        template = null;
    }

    string input = ValidateString(@string, nameof(@string));
    int replacements = 0;
    Match previous = null;
    MatchEvaluator evaluator = current => {
        // from the docs: Empty matches for the pattern are replaced
        // only when not adjacent to a previous match
        bool adjacentEmpty = string.IsNullOrEmpty(current.Value)
            && previous != null
            && previous.Index + previous.Length == current.Index;
        if (adjacentEmpty) {
            return "";
        }

        previous = current;
        replacements++;
        if (template != null) {
            return UnescapeGroups(current, template);
        }
        return PythonCalls.Call(context, repl, RE_Match.make(current, this, input)) as string;
    };

    string result = _re.Replace(input, evaluator, limit);
    return PythonTuple.MakeTuple(result, replacements);
}
// Flag bits of this pattern: the flags passed to the constructor combined with
// the options found while pre-parsing the pattern text.
public int flags { get; }
/// <summary>
/// Dictionary mapping each named group to its group number. Built on first
/// access and cached. Numeric names and the mangled names given to unnamed
/// groups are left out.
/// </summary>
public PythonDictionary groupindex {
    get {
        if (_groups != null) {
            return _groups;
        }

        string[] names = _re.GetGroupNames();
        int[] numbers = _re.GetGroupNumbers();
        PythonDictionary index = new PythonDictionary();
        // entry 0 is skipped
        for (int i = 1; i < names.Length; i++) {
            string name = names[i];
            bool userNamed = !char.IsDigit(name[0]) && !name.StartsWith(_mangledNamedGroup);
            if (userNamed) {
                index[name] = numbers[i];
            }
        }

        _groups = index;
        return _groups;
    }
}
/// <summary>Number of groups reported by the regex, not counting group 0.</summary>
public int groups {
    get {
        int[] numbers = _re.GetGroupNumbers();
        return numbers.Length - 1;
    }
}

/// <summary>The pattern text as originally supplied, before pre-parsing.</summary>
public string pattern {
    get { return _pre.UserPattern; }
}
/// <summary>
/// Two patterns are equal when their user pattern text and their flags are equal.
/// </summary>
public override bool Equals(object obj) {
    RE_Pattern other = obj as RE_Pattern;
    if (other == null) {
        return false;
    }
    return other.pattern == pattern && other.flags == flags;
}

/// <summary>Hash combining the pattern text's hash code with the flags.</summary>
public override int GetHashCode() {
    return pattern.GetHashCode() ^ flags;
}
#region IWeakReferenceable Members
/// <summary>Returns the stored weak-reference tracker (null until one is set).</summary>
WeakRefTracker IWeakReferenceable.GetWeakRef() {
    return _weakRefTracker;
}

/// <summary>Stores the tracker and reports success.</summary>
bool IWeakReferenceable.SetWeakRef(WeakRefTracker value) {
    _weakRefTracker = value;
    return true;
}

/// <summary>Stores the tracker, exactly as SetWeakRef does.</summary>
void IWeakReferenceable.SetFinalizer(WeakRefTracker value) {
    _weakRefTracker = value;
}
#endregion
}
/// <summary>
/// Reduction helper for pattern objects: returns a tuple of the "re" module's
/// "compile" attribute and the (pattern, flags) argument tuple.
/// </summary>
/// <exception cref="InvalidOperationException">The imported module does not expose "compile".</exception>
public static PythonTuple _pickle(CodeContext/*!*/ context, RE_Pattern pattern) {
    object scope = Importer.ImportModule(context, new PythonDictionary(), "re", false, 0);
    if (scope is PythonModule module) {
        if (module.__dict__.TryGetValue("compile", out object compile)) {
            PythonTuple args = PythonTuple.MakeTuple(pattern.pattern, pattern.flags);
            return PythonTuple.MakeTuple(compile, args);
        }
    }
    throw new InvalidOperationException("couldn't find compile method");
}
[PythonType]
public class RE_Match {
// The underlying .NET match this object wraps.
private readonly Match _m;
// Cache for the lastindex property: -1 = not computed yet, 0 = no group matched,
// any other value = the computed index.
private int _lastindex = -1;
#region Internal makers
/// <summary>
/// Wraps a successful match, covering the whole input; returns null for a failed match.
/// </summary>
internal static RE_Match make(Match m, RE_Pattern pattern, string input) {
    return m.Success ? new RE_Match(m, pattern, input, 0, input.Length) : null;
}

/// <summary>
/// Wraps a successful match with explicit positions; returns null for a failed match.
/// </summary>
internal static RE_Match make(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
    return m.Success ? new RE_Match(m, pattern, input, offset, endpos) : null;
}

/// <summary>
/// Like make, but additionally requires the match to begin exactly at <paramref name="offset"/>.
/// </summary>
internal static RE_Match makeMatch(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
    bool anchored = m.Success && m.Index == offset;
    return anchored ? new RE_Match(m, pattern, input, offset, endpos) : null;
}

/// <summary>
/// Like makeMatch, but additionally requires the match to span exactly
/// <paramref name="offset"/> to <paramref name="endpos"/>.
/// </summary>
internal static RE_Match makeFullMatch(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
    bool full = m.Success && m.Index == offset && m.Length == endpos - offset;
    return full ? new RE_Match(m, pattern, input, offset, endpos) : null;
}
#endregion
#region Public ctors
/// <summary>
/// Creates a match object whose pos and endpos are both 0.
/// </summary>
public RE_Match(Match m, RE_Pattern pattern, string text)
    : this(m, pattern, text, 0, 0) {
}

/// <summary>
/// Creates a match object for <paramref name="m"/>, recording the pattern,
/// the searched text and the search window.
/// </summary>
public RE_Match(Match m, RE_Pattern pattern, string text, int pos, int endpos) {
    this.pos = pos;
    this.endpos = endpos;
    @string = text;
    re = pattern;
    _m = m;
}
#endregion
#region Public API Surface
/// <summary>Index just past the end of the whole match.</summary>
public int end() {
    return _m.Index + _m.Length;
}

/// <summary>Index at which the whole match begins.</summary>
public int start() {
    return _m.Index;
}

/// <summary>
/// Start index of the given group (number or name), or -1 when the group did
/// not take part in the match.
/// </summary>
public int start(object group) {
    Group g = _m.Groups[GetGroupIndex(group)];
    return g.Success ? g.Index : -1;
}

/// <summary>
/// End index of the given group (number or name), or -1 when the group did
/// not take part in the match.
/// </summary>
public int end(object group) {
    Group g = _m.Groups[GetGroupIndex(group)];
    return g.Success ? g.Index + g.Length : -1;
}
/// <summary>
/// Returns the text of one or more groups. With a single argument the result
/// is that group's text; with several it is a tuple with one entry per
/// requested group. Groups that did not take part in the match yield null.
/// </summary>
public object group(object index, params object[] additional) {
    if (additional.Length == 0) {
        return group(index);
    }

    object[] values = new object[additional.Length + 1];
    for (int i = 0; i < values.Length; i++) {
        object key = i == 0 ? index : additional[i - 1];
        Group g = _m.Groups[GetGroupIndex(key)];
        values[i] = g.Success ? g.Value : null;
    }
    return PythonTuple.MakeTuple(values);
}
/// <summary>
/// Text of the given group (number or name), or null when the group did not
/// take part in the match.
/// </summary>
public string group(object index) {
    Group g = _m.Groups[GetGroupIndex(index)];
    if (!g.Success) {
        return null;
    }
    return g.Value;
}

/// <summary>Text of the whole match (group 0).</summary>
public string group() {
    return group(0);
}
/// <summary>Tuple of all group values, with null for groups that did not match.</summary>
[return: SequenceTypeInfo(typeof(string))]
public PythonTuple groups() {
    return groups(null);
}

/// <summary>
/// Tuple of all group values (group 0 excluded), using
/// <paramref name="default"/> for groups that did not take part in the match.
/// </summary>
public PythonTuple groups(object @default) {
    GroupCollection all = _m.Groups;
    object[] values = new object[all.Count - 1];
    for (int slot = 0; slot < values.Length; slot++) {
        Group g = all[slot + 1];
        values[slot] = g.Success ? g.Value : @default;
    }
    return PythonTuple.MakeTuple(values);
}
/// <summary>
/// Expands a replacement template against this match.
/// </summary>
/// <param name="template">
/// Template text. <c>\N</c> (a single ASCII digit) and <c>\g&lt;name-or-number&gt;</c>
/// are replaced by the corresponding group's text; <c>\n</c>, <c>\r</c>,
/// <c>\t</c> and <c>\\</c> become the characters they denote. Any other escape,
/// a <c>\g</c> not followed by <c>&lt;</c>, and a trailing backslash are copied unchanged.
/// </param>
/// <returns>The expanded text.</returns>
public string expand(object template) {
    string strTmp = ValidateString(template, nameof(template));
    StringBuilder res = new StringBuilder();

    for (int i = 0; i < strTmp.Length; i++) {
        char ch = strTmp[i];
        if (ch != '\\') {
            res.Append(ch);
            continue;
        }

        // trailing backslash: keep it
        if (++i == strTmp.Length) {
            res.Append('\\');
            break;
        }

        ch = strTmp[i];
        if (ch >= '0' && ch <= '9') {
            AppendGroup(res, ch - '0');
        } else if (ch == 'g') {
            if (i + 1 == strTmp.Length || strTmp[i + 1] != '<') {
                // Not a group reference: keep "\g" and let the loop handle
                // whatever follows as ordinary template text.
                res.Append("\\g");
                continue;
            }

            // Collect the name up to '>' (or the end of the template when unterminated).
            int nameStart = i + 2;
            i = nameStart;
            while (i < strTmp.Length && strTmp[i] != '>') {
                i++;
            }
            string name = strTmp.Substring(nameStart, i - nameStart);
            AppendGroup(res, re._re.GroupNumberFromName(name));
        } else {
            switch (ch) {
                case 'n': res.Append('\n'); break;
                case 'r': res.Append('\r'); break;
                case 't': res.Append('\t'); break;
                case '\\': res.Append('\\'); break;
                default:
                    // unknown escape: leave it as written
                    res.Append('\\');
                    res.Append(ch);
                    break;
            }
        }
    }
    return res.ToString();
}
/// <summary>Named groups and their values, with null for groups that did not match.</summary>
[return: DictionaryTypeInfo(typeof(string), typeof(string))]
public PythonDictionary groupdict() {
    return groupdict(null);
}
/// <summary>
/// True when every character of <paramref name="name"/> satisfies
/// <see cref="char.IsNumber(char)"/> (an empty name therefore counts as a number).
/// </summary>
private static bool IsGroupNumber(string name) {
    int i = 0;
    while (i < name.Length && char.IsNumber(name[i])) {
        i++;
    }
    return i == name.Length;
}
/// <summary>
/// Named groups and their values, using the given string for groups that did not match.
/// </summary>
[return: DictionaryTypeInfo(typeof(string), typeof(string))]
public PythonDictionary groupdict([NotNull]string value) {
    object fallback = value;
    return groupdict(fallback);
}
/// <summary>
/// Builds a dictionary from group name to matched text. Groups whose name is
/// purely numeric are skipped; groups without captures map to
/// <paramref name="value"/>.
/// </summary>
[return: DictionaryTypeInfo(typeof(string), typeof(object))]
public PythonDictionary groupdict(object value) {
    string[] groupNames = re._re.GetGroupNames();
    Debug.Assert(groupNames.Length == _m.Groups.Count);

    PythonDictionary result = new PythonDictionary();
    for (int i = 0; i < groupNames.Length; i++) {
        string name = groupNames[i];
        // python doesn't report group numbers
        if (IsGroupNumber(name)) {
            continue;
        }

        Group g = _m.Groups[i];
        if (g.Captures.Count == 0) {
            result[name] = value;
        } else {
            result[name] = g.Value;
        }
    }
    return result;
}
/// <summary>(start, end) tuple for the whole match.</summary>
[return: SequenceTypeInfo(typeof(int))]
public PythonTuple span() {
    int first = start();
    int last = end();
    return PythonTuple.MakeTuple(first, last);
}

/// <summary>(start, end) tuple for the given group; both are -1 when the group did not match.</summary>
[return: SequenceTypeInfo(typeof(int))]
public PythonTuple span(object group) {
    int first = start(group);
    int last = end(group);
    return PythonTuple.MakeTuple(first, last);
}
// Start position recorded by the five-argument constructor (0 otherwise).
public int pos { get; }
// End position recorded by the five-argument constructor (0 otherwise).
public int endpos { get; }
// The text passed to the constructor.
public string @string { get; }
/// <summary>
/// Tuple with one (start, end) tuple per group, group 0 included.
/// </summary>
public PythonTuple regs {
    get {
        int count = _m.Groups.Count;
        object[] spans = new object[count];
        for (int g = 0; g < count; g++) {
            spans[g] = span(g);
        }
        return PythonTuple.MakeTuple(spans);
    }
}
// The pattern object passed to the constructor.
public RE_Pattern re { get; }
/// <summary>
/// Index of the last matched group, or null when no group matched. The value
/// is computed once and cached in <c>_lastindex</c> (-1 = not yet computed,
/// 0 = no group matched).
/// </summary>
public object lastindex {
    get {
        if (_lastindex == -1) {
            GroupCollection groups = _m.Groups;
            int found = 0;
            int i = 1;
            while (i < groups.Count) {
                Group candidate = groups[i];
                i++;
                if (!candidate.Success) {
                    continue;
                }

                found = i - 1;
                // Skip the following groups that begin before this group's end,
                // whether or not they matched: Match.Groups also contains the
                // "lower" level groups, which must not win.
                int limit = candidate.Index + candidate.Length;
                while (i < groups.Count && groups[i].Index < limit) {
                    i++;
                }
            }
            _lastindex = found;
        }

        if (_lastindex == 0) {
            return null;
        }
        return _lastindex;
    }
}
/// <summary>
/// Name of the group reported by lastindex, or null when lastindex is null.
/// </summary>
public string lastgroup {
    get {
        object index = lastindex;
        if (index == null) {
            return null;
        }

        // when group was not explicitly named, RegEx assigns the number as name
        // This is different from C-Python, which returns None in such cases
        return this.re._re.GroupNameFromNumber((int)index);
    }
}
#endregion
#region Private helper functions
/// <summary>Appends the text of the group at <paramref name="index"/> to <paramref name="sb"/>.</summary>
private void AppendGroup(StringBuilder sb, int index) {
    Group g = _m.Groups[index];
    sb.Append(g.Value);
}
/// <summary>
/// Resolves a group reference to a group number. Values convertible to an
/// integer are used directly; anything else is validated as a string and
/// looked up as a group name.
/// </summary>
/// <exception>IndexError ("no such group") when the number is out of range.</exception>
private int GetGroupIndex(object group) {
    int grpIndex;
    if (!Converter.TryConvertToInt32(group, out grpIndex)) {
        string name = ValidateString(group, nameof(group));
        grpIndex = re._re.GroupNumberFromName(name);
    }

    bool inRange = grpIndex >= 0 && grpIndex < _m.Groups.Count;
    if (!inRange) {
        throw PythonOps.IndexError("no such group");
    }
    return grpIndex;
}
#endregion
}
#endregion
#region Private helper functions
/// <summary>
/// Returns the pattern object for <paramref name="pattern"/>, reusing a cached
/// instance when possible.
/// </summary>
/// <param name="context">Context passed to the RE_Pattern constructor.</param>
/// <param name="pattern">An existing RE_Pattern (returned as is) or pattern text.</param>
/// <param name="flags">Python flag bits; part of the cache key.</param>
/// <param name="compiled">
/// When true, a cached entry is reused only if its regex carries
/// RegexOptions.Compiled; otherwise a compiled pattern is built and replaces it.
/// </param>
/// <returns>The cached or newly created pattern object.</returns>
private static RE_Pattern GetPattern(CodeContext/*!*/ context, object pattern, int flags, bool compiled = false) {
    if (pattern is RE_Pattern res) {
        return res;
    }

    string strPattern = ValidatePatternAsString(pattern);
    PatternKey key = new PatternKey(strPattern, flags);

    // purge() swaps the field for a new dictionary. Read the field once so the
    // dictionary that is locked is also the one that is read and written.
    var cache = _cachedPatterns;
    lock (cache) {
        if (cache.TryGetValue(key, out res)) {
            if (!compiled || res._re.Options.HasFlag(RegexOptions.Compiled)) {
                return res;
            }
        }

        res = new RE_Pattern(context, strPattern, flags, compiled);
        cache[key] = res;
        return res;
    }
}
/// <summary>
/// Lazily yields a match object for each entry of <paramref name="matches"/>,
/// each covering the whole of <paramref name="input"/>.
/// </summary>
private static IEnumerator MatchIterator(MatchCollection matches, RE_Pattern pattern, string input) {
    int index = 0;
    while (index < matches.Count) {
        Match current = matches[index];
        index++;
        yield return RE_Match.make(current, pattern, input, 0, input.Length);
    }
}
/// <summary>
/// Translates Python flag bits into <see cref="RegexOptions"/>.
/// </summary>
/// <param name="flags">Combination of the module's flag constants.</param>
/// <returns>
/// IgnoreCase, Multiline, Singleline and IgnorePatternWhitespace for
/// IGNORECASE, MULTILINE, DOTALL and VERBOSE respectively, plus
/// CultureInvariant unless LOCALE is set (the inverse of OptionToFlags).
/// </returns>
private static RegexOptions FlagsToOption(int flags) {
    RegexOptions opts = RegexOptions.None;
    if ((flags & IGNORECASE) != 0) opts |= RegexOptions.IgnoreCase;
    if ((flags & MULTILINE) != 0) opts |= RegexOptions.Multiline;
    // Without LOCALE matching must not depend on the current culture.
    // (Clearing the bit on an empty option set, as before, had no effect.)
    if ((flags & LOCALE) == 0) opts |= RegexOptions.CultureInvariant;
    if ((flags & DOTALL) != 0) opts |= RegexOptions.Singleline;
    if ((flags & VERBOSE) != 0) opts |= RegexOptions.IgnorePatternWhitespace;
    return opts;
}
/// <summary>
/// Translates <see cref="RegexOptions"/> into Python flag bits: IgnoreCase,
/// Multiline, Singleline and IgnorePatternWhitespace map to IGNORECASE,
/// MULTILINE, DOTALL and VERBOSE; LOCALE is set when CultureInvariant is absent.
/// </summary>
private static int OptionToFlags(RegexOptions options) {
    int flags = 0;
    if (options.HasFlag(RegexOptions.IgnoreCase)) flags |= IGNORECASE;
    if (options.HasFlag(RegexOptions.Multiline)) flags |= MULTILINE;
    if (!options.HasFlag(RegexOptions.CultureInvariant)) flags |= LOCALE;
    if (options.HasFlag(RegexOptions.Singleline)) flags |= DOTALL;
    if (options.HasFlag(RegexOptions.IgnorePatternWhitespace)) flags |= VERBOSE;
    return flags;
}
/// <summary>
/// Result of pre-parsing a pattern: the text as supplied, the rewritten text
/// and the options collected along the way.
/// </summary>
internal class ParsedRegex {
    // The pattern exactly as passed to the constructor.
    public string UserPattern;
    // The rewritten pattern; not set by the constructor.
    public string Pattern;
    // Options collected while pre-parsing; starts as CultureInvariant.
    public RegexOptions Options = RegexOptions.CultureInvariant;

    public ParsedRegex(string pattern) {
        UserPattern = pattern;
    }
}
// Characters that end a verbose-mode comment while pre-parsing.
private static readonly char[] _endOfLineChars = new[] { '\r', '\n' };
// Characters at which PreParseRegex stops to inspect the pattern.
private static readonly char[] _preParsedChars = new[] { '(', '{', '[', ']', '#' };
// Prefix of the artificial names given to unnamed groups; groupindex skips
// names that start with it.
private const string _mangledNamedGroup = "___PyRegexNameMangled";
/// <summary>
/// Preparses a regular expression text returning a ParsedRegex class
/// that can be used for further regular expressions.
/// </summary>
/// <remarks>
/// The first pass walks the pattern's structural characters and rewrites
/// Python-only syntax: "{,n}" gets an explicit 0, "(?P&lt;name&gt;" loses its P,
/// "(?P=name)" becomes "\k&lt;name&gt;", inline option letters are removed and
/// recorded in the result's Options, and unnamed groups that follow a named
/// group receive an artificial name. The second pass normalises backslash
/// escapes: "\Z" becomes "\z" and escapes of word characters that are not in
/// the known list are unescaped.
/// </remarks>
/// <param name="context">Context used to create the module's error exception.</param>
/// <param name="pattern">The pattern text as supplied by the user.</param>
/// <param name="verbose">True when the VERBOSE flag was passed; enables "#" comment skipping.</param>
/// <returns>The parsed result holding the user pattern, the rewritten pattern and the collected options.</returns>
private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string pattern, bool verbose) {
    ParsedRegex res = new ParsedRegex(pattern);

    int cur = 0, nameIndex;
    bool isCharList = false;
    bool containsNamedGroup = false;
    bool inComment = false;
    // number of capturing groups opened so far; used in the redefinition error message
    int groupCount = 0;
    var namedGroups = new Dictionary<string, int>();

    for (; ; ) {
        if (verbose && inComment) {
            // read to end of line
            inComment = false;
            var idx = pattern.IndexOfAny(_endOfLineChars, cur);
            if (idx < 0) break;
            cur = idx;
        }

        nameIndex = pattern.IndexOfAny(_preParsedChars, cur);
        if (nameIndex > 0 && pattern[nameIndex - 1] == '\\') {
            int curIndex = nameIndex - 2;
            int backslashCount = 1;
            while (curIndex >= 0 && pattern[curIndex] == '\\') {
                backslashCount++;
                curIndex--;
            }
            // odd number of back slashes, this is an optional
            // paren that we should ignore.
            if ((backslashCount & 0x01) != 0) {
                cur = ++nameIndex;
                continue;
            }
        }

        if (nameIndex == -1) break;
        if (nameIndex == pattern.Length - 1) break;

        switch (pattern[nameIndex]) {
            case '{':
                if (pattern[++nameIndex] == ',') {
                    // no beginning specified for the n-m quantifier, add the
                    // default 0 value.
                    pattern = pattern.Insert(nameIndex, "0");
                }
                break;
            case '[':
                nameIndex++;
                isCharList = true;
                break;
            case ']':
                nameIndex++;
                isCharList = false;
                break;
            case '#':
                if (verbose && !isCharList) {
                    inComment = true;
                }
                nameIndex++;
                break;
            case '(':
                // make sure we're not dealing with [(]
                if (!isCharList) {
                    switch (pattern[++nameIndex]) {
                        case '?':
                            // extension syntax
                            if (nameIndex == pattern.Length - 1) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");
                            switch (pattern[++nameIndex]) {
                                case 'P':
                                    // named regex, .NET doesn't expect the P so we'll remove it;
                                    // also, once we see a named group i.e. ?P then we need to start artificially
                                    // naming all unnamed groups from then on---this is to get around the fact that
                                    // the CLR RegEx support orders all the unnamed groups before all the named
                                    // groups, even if the named groups are before the unnamed ones in the pattern;
                                    // the artificial naming preserves the order of the groups and thus the order of
                                    // the matches
                                    if (nameIndex + 1 < pattern.Length && pattern[nameIndex + 1] == '=') {
                                        // match whatever was previously matched by the named group

                                        // remove the (?P=
                                        pattern = pattern.Remove(nameIndex - 2, 4);
                                        pattern = pattern.Insert(nameIndex - 2, "\\k<");
                                        int tmpIndex = pattern.IndexOf(')', nameIndex);
                                        if (tmpIndex == -1) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");

                                        pattern = pattern.Substring(0, tmpIndex) + ">" + pattern.Substring(tmpIndex + 1);
                                    } else {
                                        containsNamedGroup = true;
                                        // we need to look and see if the named group was already seen and throw an error if it was
                                        if (nameIndex + 1 < pattern.Length && pattern[nameIndex + 1] == '<') {
                                            int tmpIndex = pattern.IndexOf('>', nameIndex);
                                            if (tmpIndex == -1) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");

                                            // a named group is a capturing group
                                            groupCount++;
                                            var namedGroup = pattern.Substring(nameIndex + 2, tmpIndex - (nameIndex + 2));
                                            if (namedGroups.TryGetValue(namedGroup, out int previousGroup)) {
                                                throw PythonExceptions.CreateThrowable(error(context), $"redefinition of group name '{namedGroup}' as group {groupCount}; was group {previousGroup}");
                                            }

                                            namedGroups[namedGroup] = groupCount;
                                        }

                                        pattern = pattern.Remove(nameIndex, 1);
                                    }
                                    break;
                                case 'i':
                                    res.Options |= RegexOptions.IgnoreCase;
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case 'L':
                                    res.Options &= ~(RegexOptions.CultureInvariant);
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case 'm':
                                    res.Options |= RegexOptions.Multiline;
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case 's':
                                    res.Options |= RegexOptions.Singleline;
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case 'u':
                                    // specify unicode; not relevant and not valid under .NET as we're always unicode
                                    // -- so the option needs to be removed
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case 'x':
                                    res.Options |= RegexOptions.IgnorePatternWhitespace;
                                    RemoveOption(ref pattern, ref nameIndex);
                                    break;
                                case ':': break; // non-capturing
                                case '=': break; // look ahead assertion
                                case '<': break; // positive look behind assertion
                                case '!': break; // negative look ahead assertion
                                case '#': break; // inline comment
                                case '(':
                                    // conditional match alternation (?(id/name)yes-pattern|no-pattern)
                                    // move past ?( so we don't preparse the name.
                                    nameIndex++;
                                    break;
                                default: throw PythonExceptions.CreateThrowable(error(context), "Unrecognized extension " + pattern[nameIndex]);
                            }
                            break;
                        default:
                            // just another (capturing) group
                            groupCount++;
                            if (containsNamedGroup) {
                                // need to name this unnamed group
                                pattern = pattern.Insert(nameIndex, "?<" + _mangledNamedGroup + GetRandomString() + ">");
                            }
                            break;
                    }
                } else {
                    nameIndex++;
                }
                break;
        }

        cur = nameIndex;
    }

    cur = 0;
    for (; ; ) {
        nameIndex = pattern.IndexOf('\\', cur);

        if (nameIndex == -1 || nameIndex == pattern.Length - 1) break;
        cur = ++nameIndex;
        char curChar = pattern[cur];
        switch (curChar) {
            case 'x':
            case 'u':
            case 'a':
            case 'b':
            case 'e':
            case 'f':
            case 'k':
            case 'n':
            case 'r':
            case 't':
            case 'v':
            case 'c':
            case 's':
            case 'W':
            case 'w':
            case 'p':
            case 'P':
            case 'S':
            case 'd':
            case 'D':
            case 'A':
            case 'B':
            case '\\':
                // known escape sequences, leave escaped.
                break;
            case 'Z':
                // /Z matches "end of string" in Python, replace with /z which is the .NET equivalent
                pattern = pattern.Remove(cur, 1).Insert(cur, "z");
                break;
            default:
                System.Globalization.UnicodeCategory charClass = CharUnicodeInfo.GetUnicodeCategory(curChar);
                switch (charClass) {
                    // recognized word characters, always unescape.
                    case System.Globalization.UnicodeCategory.ModifierLetter:
                    case System.Globalization.UnicodeCategory.LowercaseLetter:
                    case System.Globalization.UnicodeCategory.UppercaseLetter:
                    case System.Globalization.UnicodeCategory.TitlecaseLetter:
                    case System.Globalization.UnicodeCategory.OtherLetter:
                    case System.Globalization.UnicodeCategory.LetterNumber:
                    case System.Globalization.UnicodeCategory.OtherNumber:
                    case System.Globalization.UnicodeCategory.ConnectorPunctuation:
                        pattern = pattern.Remove(nameIndex - 1, 1);
                        cur--;
                        break;
                    case System.Globalization.UnicodeCategory.DecimalDigitNumber:
                        // actually don't want to unescape '\1', '\2' etc. which are references to groups
                        break;
                }
                break;
        }
        if (++cur >= pattern.Length) {
            break;
        }
    }

    res.Pattern = pattern;
    return res;
}
/// <summary>
/// Removes the inline option letter at <paramref name="nameIndex"/> from
/// <paramref name="pattern"/>. When the letter is the only option in a
/// "(?x)" construct the whole construct (four characters) is removed;
/// otherwise only the letter is. In both cases <paramref name="nameIndex"/>
/// is moved back by two.
/// </summary>
private static void RemoveOption(ref string pattern, ref int nameIndex) {
    bool soleOption = pattern[nameIndex - 1] == '?'
        && nameIndex < pattern.Length - 1
        && pattern[nameIndex + 1] == ')';

    int removeAt = soleOption ? nameIndex - 2 : nameIndex;
    int removeLength = soleOption ? 4 : 1;

    pattern = pattern.Remove(removeAt, removeLength);
    nameIndex -= 2;
}
/// <summary>
/// Returns the decimal text of a random integer in the range
/// [int.MaxValue / 2, int.MaxValue), used as the suffix of mangled group names.
/// </summary>
private static string GetRandomString() {
    int value = r.Next(int.MaxValue / 2, int.MaxValue);
    return value.ToString();
}
/// <summary>
/// Expands a replacement template for one match, as used by sub and subn.
/// </summary>
/// <param name="m">The match supplying group values.</param>
/// <param name="text">
/// The template. Recognised escapes: <c>\n \r \t \\ \' \b</c>,
/// <c>\g&lt;number-or-name&gt;</c> group references, and octal digit runs
/// (a single non-zero digit that names an existing group is a group reference
/// instead). Any other escape is copied unchanged.
/// </param>
/// <returns>The expanded text; <paramref name="text"/> itself when it contains no backslash.</returns>
/// <exception>IndexError ("unknown group reference") when a <c>\g&lt;...&gt;</c> group did not take part in the match.</exception>
private static string UnescapeGroups(Match m, string text) {
    int first = text.IndexOf('\\');
    if (first < 0) {
        return text;
    }

    StringBuilder sb = new StringBuilder(text, 0, first, text.Length);
    for (int i = first; i < text.Length; i++) {
        if (text[i] != '\\') {
            sb.Append(text[i]);
            continue;
        }

        // trailing backslash: keep it
        if (++i == text.Length) {
            sb.Append('\\');
            break;
        }

        switch (text[i]) {
            case 'n': sb.Append('\n'); break;
            case 'r': sb.Append('\r'); break;
            case 't': sb.Append('\t'); break;
            case '\\': sb.Append('\\'); break;
            case '\'': sb.Append('\''); break;
            case 'b': sb.Append('\b'); break;
            case 'g':
                // \g<#>, \g<name> need to be substituted by the groups they matched
                int anglebrkEnd = (i + 1 < text.Length && text[i + 1] == '<')
                    ? text.IndexOf('>', i + 2)
                    : -1;
                if (anglebrkEnd == -1) {
                    // not a (terminated) group reference: keep "\g" as written
                    sb.Append('\\');
                    sb.Append(text[i]);
                    break;
                }

                // grab the # or 'name' of the group between '< >'
                string grp = text.Substring(i + 2, anglebrkEnd - (i + 2));
                Group g = StringUtils.TryParseInt32(grp, out int num) ? m.Groups[num] : m.Groups[grp];
                // Success is false for a missing group and for one that did not take
                // part in the match; a group that matched "" is valid.
                if (!g.Success) {
                    throw PythonOps.IndexError("unknown group reference");
                }
                sb.Append(g.Value);
                i = anglebrkEnd;
                break;
            default:
                if (text[i] >= '0' && text[i] <= '7') {
                    int val = 0;
                    int digitCount = 0;
                    while (i < text.Length && text[i] >= '0' && text[i] <= '7') {
                        digitCount++;
                        val = val * 8 + (text[i] - '0');
                        i++;
                    }
                    i--;   // the loop increment steps past the last digit

                    if (digitCount == 1 && val > 0 && val < m.Groups.Count) {
                        sb.Append(m.Groups[val].Value);
                    } else {
                        sb.Append((char)val);
                    }
                } else {
                    sb.Append('\\');
                    sb.Append(text[i]);
                }
                break;
        }
    }
    return sb.ToString();
}
/// <summary>
/// Normalises a pattern argument: string-like values are returned as a
/// string, a compiled pattern is returned unchanged.
/// </summary>
/// <exception>TypeError for any other type (including null).</exception>
private static object ValidatePattern(object pattern) {
    if (pattern is string s) return s;
    if (pattern is ExtensibleString es) return es.Value;
    if (pattern is Bytes bytes) return bytes.MakeString();
    if (pattern is RE_Pattern rep) return rep;

    throw PythonOps.TypeError("pattern must be a string or compiled pattern");
}
/// <summary>
/// Returns the pattern text for a pattern argument: string-like values are
/// converted to a string, a compiled pattern yields its original user pattern.
/// </summary>
/// <exception>TypeError for any other type (including null).</exception>
private static string ValidatePatternAsString(object pattern) {
    if (pattern is string s) return s;
    if (pattern is ExtensibleString es) return es.Value;
    if (pattern is Bytes bytes) return bytes.MakeString();
    if (pattern is RE_Pattern rep) return rep._pre.UserPattern;

    throw PythonOps.TypeError("pattern must be a string or compiled pattern");
}
/// <summary>
/// Converts a supported input object (string, ExtensibleString, Bytes,
/// ByteArray, array and, when FEATURE_MMAP is defined, a memory-mapped file)
/// to the string that is searched.
/// </summary>
/// <param name="str">The value to convert.</param>
/// <param name="param">Parameter name used in the error message.</param>
/// <exception>TypeError for any other type (including null).</exception>
private static string ValidateString(object str, string param) {
    if (str is string s) return s;
    if (str is ExtensibleString es) return es.Value;
    if (str is Bytes bytes) return bytes.MakeString();
    if (str is ByteArray byteArray) return byteArray.MakeString();
    if (str is ArrayModule.array array) return Bytes.Make(array.ToByteArray()).MakeString();
#if FEATURE_MMAP
    if (str is MmapModule.MmapDefault mmapFile) return mmapFile.GetSearchString().MakeString();
#endif

    throw PythonOps.TypeError($"expected string for parameter '{param}' but got '{PythonOps.GetPythonTypeName(str)}'");
}
/// <summary>
/// Returns the module's error exception type, stored in the language context
/// under the module-state key "reerror".
/// </summary>
private static PythonType error(CodeContext/*!*/ context) {
    object state = context.LanguageContext.GetModuleState("reerror");
    return (PythonType)state;
}
/// <summary>
/// Cache key for compiled patterns: the pattern text together with its flags.
/// </summary>
private readonly struct PatternKey : IEquatable<PatternKey> {
    public readonly string Pattern;
    public readonly int Flags;

    public PatternKey(string pattern, int flags) {
        Pattern = pattern;
        Flags = flags;
    }

    public override bool Equals(object obj) => obj is PatternKey key && Equals(key);

    // Null-safe so that default(PatternKey) can be hashed.
    public override int GetHashCode() => (Pattern?.GetHashCode() ?? 0) ^ Flags;

    #region IEquatable<PatternKey> Members

    /// <summary>Keys are equal when both the pattern text and the flags are equal.</summary>
    public bool Equals(PatternKey other) => other.Pattern == Pattern && other.Flags == Flags;

    #endregion
}
#endregion
}
}