代码之家  ›  专栏  ›  技术社区  ›  mpen

扩展字符集的算法?

c#
  •  3
  • mpen  · 技术社区  · 14 年前

    有没有现成的函数来扩展C#regex样式的字符集?

    例如, expand("a-z1") 将返回一个字符串,该字符串包含从a到z的所有字符,后跟数字1。


    到目前为止我得到的是:

    /// <summary>
    /// Expands a regex-style character set such as <c>a-fA-F0-9</c> into the string of
    /// all characters it denotes (<c>abcdefABCDEF0123456789</c>).
    /// </summary>
    /// <param name="set">
    /// The set body without surrounding brackets. <c>x-y</c> denotes the inclusive range
    /// from x to y; every other character is copied literally. A dash that is the first
    /// or last character is treated as a literal. Negation and escapes are not supported.
    /// </param>
    /// <returns>The expanded characters, in the order they appear in <paramref name="set"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="set"/> is null.</exception>
    /// <remarks>A reversed range (for example <c>z-a</c>) contributes no characters.</remarks>
    public static string ExpandCharacterSet(string set)
    {
        if (set == null)
            throw new ArgumentNullException("set");

        var sb = new StringBuilder();
        int start = 0;
        while (start < set.Length - 1)
        {
            // Search from start + 1 so that a dash sitting at 'start' is kept as a literal.
            int dash = set.IndexOf('-', start + 1);

            // No further dash, or the dash is the last character: nothing left to expand.
            if (dash <= 0 || dash >= set.Length - 1)
                break;

            // Literal characters before the range; the range's first endpoint is excluded
            // here because the loop below emits it.
            sb.Append(set.Substring(start, dash - start - 1));

            char a = set[dash - 1];
            char z = set[dash + 1];

            // Count with an int: a char counter wraps from U+FFFF back to U+0000, so a
            // range ending at char.MaxValue would never terminate.
            for (int code = a; code <= z; ++code)
                sb.Append((char)code);

            start = dash + 2;
        }

        // Whatever follows the last range is literal.
        sb.Append(set.Substring(start));
        return sb.ToString();
    }
    

    有什么我忽略的吗?

    附言: 现在让我们忽略负字符集。


    我觉得我的例子很清楚。。。让我们再试一次。这就是我想要的:

    ExpandCharacterSet("a-fA-F0-9") == "abcdefABCDEF0123456789"
    
    4 回复  |  直到 11 年前
        1
  •  2
  •   Jeff Mercado    14 年前

    我花了一点工夫才做出来,下面是我能拼凑出的方案。当然,它不具备可移植性,因为我是通过反射去调用 .NET 的内部(非公开)类型。但对于简单的测试用例来说已经足够好了。它可以接受任何正则表达式字符类,但不适用于取反的字符类——不加任何限制的话,取反后的取值范围实在太大。我不确定它是否适用于所有情况,而且它完全不处理重复(量词),但这至少是一个开始,起码你不必自己编写解析器。以下代码基于 .NET Framework 4.0:

    /// <summary>
    /// Expands a regex character class into the characters it contains by driving
    /// non-public types of the regular-expression assembly through reflection.
    /// Every type and member name below is looked up by string, so this only works on
    /// runtimes whose internals match those names.
    /// </summary>
    public static class RegexHelper
    {
        /// <summary>
        /// Parses <paramref name="charClass"/> with the reflected parser and returns the
        /// concatenation of every parsed range (except range 0), each expanded to its
        /// individual characters.
        /// </summary>
        /// <param name="charClass">The character class text handed to the parser as its pattern.</param>
        /// <returns>The expanded characters of ranges 1..count-1, in parser order.</returns>
        public static string ExpandCharClass(string charClass)
        {
            var regexParser = new RegexParser(CultureInfo.CurrentCulture);
            regexParser.SetPattern(charClass);
            var regexCharClass = regexParser.ScanCharClass(false);
            int count = regexCharClass.RangeCount();
            List<string> ranges = new List<string>();
            // range 0 can be skipped
            // NOTE(review): presumably range 0 is the leading '[' recorded as a literal — confirm against the parser.
            for (int i = 1; i < count; i++)
            {
                var range = regexCharClass.GetRangeAt(i);
                ranges.Add(ExpandRange(range));
            }
            return String.Concat(ranges);
        }
    
        /// <summary>
        /// Builds the string of all characters from <c>range._first</c> to
        /// <c>range._last</c>, inclusive.
        /// </summary>
        static string ExpandRange(SingleRange range)
        {
            char first = range._first;
            char last = range._last;
            return String.Concat(Enumerable.Range(first, last - first + 1).Select(i => (char)i));
        }
    
        /// <summary>
        /// Reflection wrapper around the non-public type
        /// <c>System.Text.RegularExpressions.RegexParser</c>.
        /// </summary>
        internal class RegexParser
        {
            static readonly Type RegexParserType;
            static readonly ConstructorInfo RegexParser_Ctor;
            static readonly MethodInfo RegexParser_SetPattern;
            static readonly MethodInfo RegexParser_ScanCharClass;
    
            // Resolves the reflected type and members once. None of the lookups is
            // null-checked, so a runtime lacking any of these names fails here or on first use.
            static RegexParser()
            {
                RegexParserType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexParser");
    
                var flags = BindingFlags.NonPublic | BindingFlags.Instance;
                RegexParser_Ctor = RegexParserType.GetConstructor(flags, null, new[] { typeof(CultureInfo) }, null);
                RegexParser_SetPattern = RegexParserType.GetMethod("SetPattern", flags, null, new[] { typeof(String) }, null);
                RegexParser_ScanCharClass = RegexParserType.GetMethod("ScanCharClass", flags, null, new[] { typeof(Boolean) }, null);
            }
    
            // The wrapped framework parser instance.
            private readonly object instance;
    
            /// <summary>Creates the underlying parser via its non-public (CultureInfo) constructor.</summary>
            internal RegexParser(CultureInfo culture)
            {
                instance = RegexParser_Ctor.Invoke(new object[] { culture });
            }
    
            /// <summary>Forwards to the reflected <c>SetPattern(string)</c> method.</summary>
            internal void SetPattern(string pattern)
            {
                RegexParser_SetPattern.Invoke(instance, new object[] { pattern });
            }
    
            /// <summary>
            /// Forwards to the reflected <c>ScanCharClass(bool)</c> method and wraps the result.
            /// </summary>
            internal RegexCharClass ScanCharClass(bool caseInsensitive)
            {
                return new RegexCharClass(RegexParser_ScanCharClass.Invoke(instance, new object[] { caseInsensitive }));
            }
        }
    
        /// <summary>
        /// Reflection wrapper around the non-public type
        /// <c>System.Text.RegularExpressions.RegexCharClass</c>.
        /// </summary>
        internal class RegexCharClass
        {
            static readonly Type RegexCharClassType;
            static readonly MethodInfo RegexCharClass_RangeCount;
            static readonly MethodInfo RegexCharClass_GetRangeAt;
    
            // Resolves the reflected type and its RangeCount() / GetRangeAt(int) methods once.
            static RegexCharClass()
            {
                RegexCharClassType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexCharClass");
    
                var flags = BindingFlags.NonPublic | BindingFlags.Instance;
                RegexCharClass_RangeCount = RegexCharClassType.GetMethod("RangeCount", flags, null, new Type[] { }, null);
                RegexCharClass_GetRangeAt = RegexCharClassType.GetMethod("GetRangeAt", flags, null, new[] { typeof(Int32) }, null);
            }
    
            // The wrapped framework char-class instance.
            private readonly object instance;
    
            /// <summary>Wraps an object that must be exactly of the reflected char-class type.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="regexCharClass"/> is null.</exception>
            /// <exception cref="ArgumentException">The object's runtime type is not the reflected type.</exception>
            internal RegexCharClass(object regexCharClass)
            {
                if (regexCharClass == null)
                    throw new ArgumentNullException("regexCharClass");
                if (regexCharClass.GetType() != RegexCharClassType)
                    throw new ArgumentException("not an instance of a RegexCharClass object", "regexCharClass");
                instance = regexCharClass;
            }
    
            /// <summary>Returns the result of the reflected <c>RangeCount()</c> call.</summary>
            internal int RangeCount()
            {
                return (int)RegexCharClass_RangeCount.Invoke(instance, new object[] { });
            }
    
            /// <summary>Returns the result of the reflected <c>GetRangeAt(i)</c> call, wrapped in a <see cref="SingleRange"/>.</summary>
            internal SingleRange GetRangeAt(int i)
            {
                return new SingleRange(RegexCharClass_GetRangeAt.Invoke(instance, new object[] { i }));
            }
        }
    
        /// <summary>
        /// Copy of the <c>_first</c> and <c>_last</c> fields of the non-public nested type
        /// <c>RegexCharClass+SingleRange</c>.
        /// </summary>
        internal struct SingleRange
        {
            static readonly Type RegexCharClassSingleRangeType;
            static readonly FieldInfo SingleRange_first;
            static readonly FieldInfo SingleRange_last;
    
            // Resolves the reflected nested type and its two non-public instance fields once.
            static SingleRange()
            {
                RegexCharClassSingleRangeType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexCharClass+SingleRange");
    
                var flags = BindingFlags.NonPublic | BindingFlags.Instance;
                SingleRange_first = RegexCharClassSingleRangeType.GetField("_first", flags);
                SingleRange_last = RegexCharClassSingleRangeType.GetField("_last", flags);
            }
    
            // First and last character of the range, copied from the reflected object;
            // ExpandRange treats both ends as inclusive.
            internal char _first;
            internal char _last;
    
            /// <summary>Reads <c>_first</c> and <c>_last</c> out of a reflected SingleRange object.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="singleRange"/> is null.</exception>
            /// <exception cref="ArgumentException">The object's runtime type is not the reflected type.</exception>
            internal SingleRange(object singleRange)
            {
                if (singleRange == null)
                    throw new ArgumentNullException("singleRange");
                if (singleRange.GetType() != RegexCharClassSingleRangeType)
                    throw new ArgumentException("not an instance of a SingleRange object", "singleRange");
                _first = (char)SingleRange_first.GetValue(singleRange);
                _last = (char)SingleRange_last.GetValue(singleRange);
            }
        }
    }
    
    // Usage: expand a bracketed class containing an escaped dash, ranges, and literal characters.
    RegexHelper.ExpandCharClass(@"[\-a-zA-F1 5-9]");
    // Result reported by the author: "-abcdefghijklmnopqrstuvwxyzABCDEF1 56789"
    
        2
  •  1
  •   Seth    14 年前

    这似乎是一个非常不寻常的要求,但是由于只有大约96个字符可以匹配(除非包含高位字符),所以您不妨对所有字符测试正则表达式,然后输出匹配结果:

    /// <summary>
    /// Returns every character of a fixed candidate alphabet (A-Z, a-z, 0-9) that,
    /// taken as a one-character string, is matched by the given regular expression.
    /// </summary>
    /// <param name="input_re">The regular expression pattern to test each candidate against.</param>
    /// <returns>The matching candidates, in alphabet order; empty if none match.</returns>
    public static string expando(string input_re) {

        // Candidate alphabet; add more chars as needed, such as ,.?/|=+_-éñ etc.
        const string candidates = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

        Regex pattern = new Regex(input_re);

        // At most every candidate matches, so the buffer never needs to grow.
        char[] kept = new char[candidates.Length];
        int keptCount = 0;

        foreach (char candidate in candidates) {
            if (pattern.IsMatch(candidate.ToString())) {
                kept[keptCount++] = candidate;
            }
        }

        return new string(kept, 0, keptCount);
    }
    

    通过使用一个实际的正则表达式来确定字符类,您可以展开任何想要的正则表达式, [^A-B]|[0123a-cg-h] ,例如。

        3
  •  1
  •   porges    14 年前

    像这样的?

    // Expands a simplified character-set specification: "x-y" is an inclusive range,
    // any other character stands for itself.
    var input = "a-fA-F0-9!";

    // Try the three-character range form first, then fall back to a single character.
    // Singleline makes '.' match '\n' as well, so a newline in the input is not silently dropped.
    var matches = Regex.Matches(input, @".-.|.", RegexOptions.Singleline);

    var list = new StringBuilder();

    foreach (Match m in matches)
    {
        var value = m.Value;

        // A single character is copied as-is.
        if (value.Length == 1)
        {
            list.Append(value);
            continue;
        }

        // Otherwise value is "x-y"; reject reversed ranges (or swap the endpoints, if you prefer).
        if (value[2] < value[0]) throw new ArgumentException("invalid format");

        // Count with an int: a char counter wraps from U+FFFF to U+0000, so a range
        // ending at char.MaxValue would never terminate.
        for (int code = value[0]; code <= value[2]; code++)
            list.Append((char)code);
    }

    Console.WriteLine(list);
    

    输出:

    abcdefABCDEF0123456789!
    

    当然,这其中的寓意是用更多的正则表达式来解决你的正则表达式问题!


    这是一个支持转义字符的版本。这完全取决于你希望它有多健壮……例如,我在这里没有对代理项(surrogate)做任何特殊处理,所以遇到这类字符时可能无法正常工作。此外,如果您想与当前 regex 引擎的行为完全一致,就需要确切了解它的所有规则和参数,这将是一项相当大的工作。

    // Script entry point: expands a character-set specification that may contain
    // escapes and writes the expanded characters to the console.
    void Main()
    {
        // These inputs are all equivalent; each expands to "ABCDEFA":
        //   @"\x41-\0x46\u41"   (hex escapes)
        //   @"\65-\70\65"       (decimal escapes)
        //   "A-FA"              (literal characters)
        var input = @"\x41-\0x46\u41";

        // match hex as \0x123 or \x123 or \u123, or decimal \412, or the escapes \n\t\r, or any character
        var charRegex = @"(\\(0?x|u)[0-9a-fA-F]+|\\[0-9]+|\\[ntr]|.)";

        // Try "item-item" (a range) first, then a single item.
        // Singleline makes '.' match '\n' as well, so a raw newline in the input is not silently dropped.
        var matches = Regex.Matches(input, charRegex + "-" + charRegex + "|" + charRegex, RegexOptions.Singleline);

        var list = new StringBuilder();

        foreach (Match m in matches)
        {
            var dashIndex = m.Value.IndexOf('-', 1); //don't look at 0 (in case it's a dash)
            if (dashIndex > 0) // this means we have two items: a range
            {
                var charLeft = Decode(m.Value.Substring(0, dashIndex));
                var charRight = Decode(m.Value.Substring(dashIndex + 1));

                // Reject reversed ranges (or swap the endpoints, if you prefer).
                if (charRight < charLeft) throw new ArgumentException("invalid format (left bigger than right)");

                // Count with an int: a char counter wraps from U+FFFF to U+0000, so a
                // range ending at \uffff would never terminate.
                for (int code = charLeft; code <= charRight; code++)
                    list.Append((char)code);
            }
            else // just one item
            {
                list.Append(Decode(m.Value));
            }
        }

        Console.WriteLine(list);
    }
    
    // Converts one item of the character-set syntax into the character it denotes.
    // Accepted forms: a single literal character, the escapes \n \t \r, hexadecimal
    // \xNN / \uNN / \0xNN, and decimal \NNN. Anything else makes the numeric
    // conversion throw.
    char Decode(string s)
    {
        // A lone character stands for itself.
        if (s.Length == 1)
        {
            return s[0];
        }

        // From here on s[0] is expected to be the backslash and s[1] selects the escape kind.
        char marker = s[1];

        // Named single-letter escapes (list is incomplete; extend as wished).
        if (s.Length == 2)
        {
            if (marker == 'n') return '\n';
            if (marker == 't') return '\t';
            if (marker == 'r') return '\r';
        }

        ushort code;
        if (marker == 'u' || marker == 'x')
        {
            // \uNN or \xNN: hex digits start after the two-character prefix.
            code = Convert.ToUInt16(s.Substring(2), 16);
        }
        else if (s.Length > 2 && marker == '0' && s[2] == 'x')
        {
            // \0xNN: hex digits start after the three-character prefix.
            code = Convert.ToUInt16(s.Substring(3), 16);
        }
        else
        {
            // \NNN: decimal; an unrecognised escape such as \g fails in this conversion.
            code = Convert.ToUInt16(s.Substring(1));
        }

        return (char)code;
    }
    
        4
  •  0
  •   mpen    14 年前
    // Every non-control UTF-16 code unit; the universe that a negated set is subtracted from.
    private static readonly IEnumerable<char> CharacterSet = Enumerable.Range(0, char.MaxValue + 1).Select(Convert.ToChar).Where(c => !char.IsControl(c));

    /// <summary>
    /// Expands a regex-style character set such as <c>[a-fA-F0-9]</c> into the string of
    /// all characters it denotes.
    /// </summary>
    /// <param name="set">
    /// The set, with or without surrounding brackets. <c>x-y</c> denotes the inclusive
    /// range from x to y; every other character is copied literally. A leading <c>^</c>
    /// (after the optional opening bracket) negates the set. Escapes are not supported.
    /// </param>
    /// <returns>
    /// For a plain set, the expanded characters in input order. For a negated set, every
    /// non-control character that is not in the expansion, in <see cref="HashSet{T}"/>
    /// enumeration order.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="set"/> is null.</exception>
    /// <remarks>A reversed range (for example <c>z-a</c>) contributes no characters.</remarks>
    public static string ExpandCharacterSet(string set)
    {
        if (set == null)
            throw new ArgumentNullException("set");

        var sb = new StringBuilder();
        int start = 0;
        bool invertSet = false;

        // Each step re-checks the length: stripping the brackets from "[]" leaves an
        // empty string, and indexing set[0] on it would throw.
        if (set.Length > 0 && set[0] == '[' && set[set.Length - 1] == ']')
            set = set.Substring(1, set.Length - 2);
        if (set.Length > 0 && set[0] == '^')
        {
            invertSet = true;
            set = set.Substring(1);
        }

        while (start < set.Length - 1)
        {
            // Search from start + 1 so that a dash sitting at 'start' is kept as a literal.
            int dash = set.IndexOf('-', start + 1);

            // No further dash, or the dash is the last character: nothing left to expand.
            if (dash <= 0 || dash >= set.Length - 1)
                break;

            // Literal characters before the range; the range's first endpoint is excluded
            // here because the loop below emits it.
            sb.Append(set.Substring(start, dash - start - 1));

            char a = set[dash - 1];
            char z = set[dash + 1];

            // Count with an int: a char counter wraps from U+FFFF back to U+0000, so a
            // range ending at char.MaxValue would never terminate.
            for (int code = a; code <= z; ++code)
                sb.Append((char)code);

            start = dash + 2;
        }

        // Whatever follows the last range is literal.
        sb.Append(set.Substring(start));

        if (!invertSet) return sb.ToString();

        // Negated set: everything in the universe except the expanded characters.
        var remaining = new HashSet<char>(CharacterSet);
        remaining.ExceptWith(sb.ToString());
        return new string(remaining.ToArray());
    }