Another alternative is to use a modified lexer of sorts to isolate each discrete region in your text where a replacement is warranted, and to mark that block so that replacements aren't run on it again.
Here's an example of how you'd do that:
First, we'll create a class that indicates whether a particular string has been used or not:
// Wraps one fragment of the input text together with a flag recording whether
// a replacement has already been applied to it; used fragments are skipped by
// later token rules so the same text is never rewritten twice.
public class UsageIndicator
{
    // The raw text fragment this indicator wraps.
    public string Value { get; private set; }

    // True once a replacement has been run on this fragment.
    public bool IsUsed { get; private set; }

    public UsageIndicator(string value, bool isUsed)
    {
        Value = value;
        IsUsed = isUsed;
    }

    // Render as the raw fragment so joining a list of indicators rebuilds the text.
    public override string ToString() => Value;
}
Then we'll define a class that represents both how to locate a "token" in your text and what to do once it's been found:
// Pairs a regex that locates a "token" in the text with a mutator that rewrites
// the matched text. Regions already marked as used are never matched again, so
// earlier-registered rules effectively claim their text before later ones run.
public class TokenOperation
{
// Regex used to locate occurrences of this token.
public Regex Pattern { get; private set; }
// Transformation applied to each matched substring.
public Func<string, string> Mutator { get; private set; }
public TokenOperation(string pattern, Func<string, string> mutator)
{
Pattern = new Regex(pattern);
Mutator = mutator;
}
// Splits `source` into up to three regions around the match at [index, index+length):
// an unused head (if non-empty), the mutated match (marked used), and an unused
// tail (if non-empty). `matchedIndex` is the position of the mutated region within
// the returned list: 1 when a head region precedes it, otherwise 0.
private List<UsageIndicator> ExtractRegions(string source, int index, int length, out int matchedIndex)
{
var result = new List<UsageIndicator>();
var head = source.Substring(0, index);
matchedIndex = 0;
if (head.Length > 0)
{
result.Add(new UsageIndicator(head, false));
matchedIndex = 1;
}
var body = source.Substring(index, length);
// Apply the rewrite now; the region is marked used so no later rule touches it.
body = Mutator(body);
result.Add(new UsageIndicator(body, true));
var tail = source.Substring(index + length);
if (tail.Length > 0)
{
result.Add(new UsageIndicator(tail, false));
}
return result;
}
// Scans the region list and replaces each match of Pattern in an unused region
// with its head/mutated-body/tail split, in place. Mutates `source` while
// iterating by index, which is safe because of the explicit index adjustment below.
public void Match(List<UsageIndicator> source)
{
for (var i = 0; i < source.Count; ++i)
{
// Never re-match text an earlier rule (or this one) already rewrote.
if (source[i].IsUsed)
{
continue;
}
var value = source[i];
var match = Pattern.Match(value.Value);
if (match.Success)
{
int modifyIBy;
// Replace the matched region with its split parts at the same position.
source.RemoveAt(i);
var regions = ExtractRegions(value.Value, match.Index, match.Length, out modifyIBy);
for (var j = 0; j < regions.Count; ++j)
{
source.Insert(i + j, regions[j]);
}
// Jump to the mutated region; the loop's ++i then lands on the tail (if any)
// so the remainder is re-scanned. Skipping the head is safe: Pattern.Match
// returned the FIRST match, so the head cannot contain another one.
i += modifyIBy;
}
}
}
}
After taking care of those things, putting something together to do the replacement is pretty simple:
// Applies an ordered list of token rules to an input string. Rules registered
// earlier claim their regions first, so later rules cannot rewrite text that an
// earlier rule has already consumed.
public class Rewriter
{
    private readonly List<TokenOperation> _definitions = new List<TokenOperation>();

    // Register a regex rule along with the rewrite to apply to each match.
    public void AddPattern(string pattern, Func<string, string> mutator)
    {
        _definitions.Add(new TokenOperation(pattern, mutator));
    }

    // Register a verbatim string that is always replaced with the same text.
    public void AddLiteral(string pattern, string replacement)
    {
        AddPattern(Regex.Escape(pattern), x => replacement);
    }

    // Run every registered rule, in registration order, over the input and
    // reassemble the resulting regions into the rewritten string.
    public string Rewrite(string value)
    {
        var regions = new List<UsageIndicator> { new UsageIndicator(value, false) };
        foreach (var definition in _definitions)
        {
            definition.Match(regions);
        }
        return string.Concat(regions);
    }
}
In the demo code (below), keep in mind that the order in which pattern or literal expressions are added is important: the rules added first get tokenized first. So, to prevent the `://` in the URL from being picked up as an emoticon plus a slash, we process the image block first — it contains the URL between its tags, and that region is marked as used before the emoticon rule can claim it.
// Demo: registration order matters — the [img]...[/img] rule runs first so the
// URL between its tags is marked used before the emoticon literals can see it.
class Program
{
    static void Main(string[] args)
    {
        var rewriter = new Rewriter();

        // Convert BBCode image tags first so their URLs are claimed as used.
        rewriter.AddPattern(@"\[img\].*?\[/img\]", x => x.Replace("[img]", "<img src=\"").Replace("[/img]", "\"/>"));

        // Emoticon literals run afterwards, only on text still unused.
        rewriter.AddLiteral(":/", "<img src=\"emote-sigh.png\"/>");
        rewriter.AddLiteral(":(", "<img src=\"emote-frown.png\"/>");
        rewriter.AddLiteral(":P", "<img src=\"emote-tongue.png\"/>");

        const string input = "Stacks be [img]http://example.com/overflowing.png[/img] :/";
        Console.WriteLine(rewriter.Rewrite(input));
    }
}
The sample prints:
Stacks be <img src="http://example.com/overflowing.png"/> <img src="emote-sigh.png"/>