如何在 C# 中将 rtf 字符串转换为文本
Posted
技术标签:
【中文标题】如何在 C# 中将 rtf 字符串转换为文本【英文标题】:How to convert an rtf string to text in C# 【发布时间】:2021-12-22 21:42:56 【问题描述】:有没有不使用RichTextBox 从Rtf 字符串中提取文本的简单方法?
例子:
{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par}
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}}}
应该返回:
foo
bar
【问题讨论】:
您的意思是“不使用 RichTextBox”还是“不显示 RichTextBox”?见 ***.com/questions/188545/…,基本上可以用正则表达式(RegExp)做到这一点。 我的意思是不使用 RichTextBox。这段代码会放在由报表服务器加载的 dll 中;如果引用了 Windows.Forms,该 dll 就会报错。 【参考方案1】:如何在不引用其他库的情况下,用纯 C# 做到这一点:
这个人写了一个类,按照 OP 的要求将 RTF 剥离为纯文本。 这是source
这是他的代码:
/// <summary>
/// Rich Text Stripper
/// </summary>
/// <remarks>
/// Translated from Python located at:
/// http://***.com/a/188877/448
/// </remarks>
public static class RichTextStripper
{
    /// <summary>
    /// Parser state saved when a group opens ("{") and restored when it closes ("}").
    /// </summary>
    private class StackEntry
    {
        public int NumberOfCharactersToSkip { get; set; }
        public bool Ignorable { get; set; }

        public StackEntry(int numberOfCharactersToSkip, bool ignorable)
        {
            NumberOfCharactersToSkip = numberOfCharactersToSkip;
            Ignorable = ignorable;
        }
    }

    // Tokenizer. Capture groups, in order:
    //   1 = control word, 2 = its optional numeric argument, 3 = two hex digits of \'xx,
    //   4 = control symbol (backslash + non-letter), 5 = group brace, 6 = any other character.
    // Runs of CR/LF match the unnamed alternative and therefore produce no output.
    private static readonly Regex _rtfRegex = new Regex(
        @"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Control words that mark the enclosing group as ignorable (its text is not emitted).
    // A set is used because membership is tested once per control word.
    private static readonly HashSet<string> destinations = new HashSet<string>
    {
        "aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
        "atnparent","atnref","atntime","atrfend","atrfstart","author","background",
        "bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
        "colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
        "do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
        "fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
        "ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
        "fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
        "footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
        "header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
        "hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
        "leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
        "listoverridetable","listpicture","liststylename","listtable","listtext",
        "lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
        "mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
        "mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
        "mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
        "mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
        "mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
        "mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
        "mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
        "mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
        "mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
        "mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
        "mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
        "mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
        "mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
        "msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
        "msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
        "mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
        "objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
        "oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
        "passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
        "pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
        "result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
        "shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
        "svb","tc","template","themedata","title","txe","ud","upr","userprops",
        "wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
        "xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
        "xmlopen"
    };

    // Control words that translate directly into output text.
    private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
    {
        { "par", "\n" },
        { "sect", "\n\n" },
        { "page", "\n\n" },
        { "line", "\n" },
        { "tab", "\t" },
        { "emdash", "\u2014" },
        { "endash", "\u2013" },
        { "emspace", "\u2003" },
        { "enspace", "\u2002" },
        { "qmspace", "\u2005" },
        { "bullet", "\u2022" },
        { "lquote", "\u2018" },
        { "rquote", "\u2019" },
        { "ldblquote", "\u201C" },
        { "rdblquote", "\u201D" },
    };

    /// <summary>
    /// Parses the numeric argument of a control word without throwing.
    /// </summary>
    /// <param name="arg">Argument text captured by the tokenizer; may be empty.</param>
    /// <param name="value">Parsed value, or 0 when parsing fails.</param>
    /// <returns>True when <paramref name="arg"/> is a valid 32-bit integer.</returns>
    private static bool TryParseArgument(string arg, out int value)
    {
        return Int32.TryParse(
            arg,
            System.Globalization.NumberStyles.AllowLeadingSign,
            System.Globalization.CultureInfo.InvariantCulture,
            out value);
    }

    /// <summary>
    /// Strip RTF Tags from RTF Text
    /// </summary>
    /// <param name="inputRtf">RTF formatted text</param>
    /// <returns>Plain text from RTF, or null when <paramref name="inputRtf"/> is null</returns>
    public static string StripRichTextFormat(string inputRtf)
    {
        if (inputRtf == null)
        {
            return null;
        }

        var stack = new Stack<StackEntry>();
        bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
        int ucskip = 1;         // Number of ASCII characters to skip after a unicode character.
        int curskip = 0;        // Number of ASCII characters left to skip
        var output = new System.Text.StringBuilder(); // Output buffer.

        foreach (Match match in _rtfRegex.Matches(inputRtf))
        {
            string word = match.Groups[1].Value;
            string arg = match.Groups[2].Value;
            string hex = match.Groups[3].Value;
            string character = match.Groups[4].Value;
            string brace = match.Groups[5].Value;
            string tchar = match.Groups[6].Value;

            if (!String.IsNullOrEmpty(brace))
            {
                curskip = 0;
                if (brace == "{")
                {
                    // Push state
                    stack.Push(new StackEntry(ucskip, ignorable));
                }
                else if (stack.Count > 0)
                {
                    // Pop state. An unmatched "}" is ignored instead of throwing
                    // InvalidOperationException on an empty stack.
                    StackEntry entry = stack.Pop();
                    ucskip = entry.NumberOfCharactersToSkip;
                    ignorable = entry.Ignorable;
                }
            }
            else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
            {
                curskip = 0;
                if (character == "~")
                {
                    if (!ignorable)
                    {
                        output.Append("\u00A0");
                    }
                }
                else if ("{}\\".Contains(character))
                {
                    // Escaped literal brace or backslash.
                    if (!ignorable)
                    {
                        output.Append(character);
                    }
                }
                else if (character == "*")
                {
                    ignorable = true;
                }
            }
            else if (!String.IsNullOrEmpty(word)) // \foo
            {
                curskip = 0;
                string special;
                int number;
                if (destinations.Contains(word))
                {
                    ignorable = true;
                }
                else if (ignorable)
                {
                    // Inside an ignorable group: control words produce nothing.
                }
                else if (specialCharacters.TryGetValue(word, out special))
                {
                    output.Append(special);
                }
                else if (word == "uc")
                {
                    // A missing or unparsable argument leaves the current skip count unchanged.
                    if (TryParseArgument(arg, out number))
                    {
                        ucskip = number;
                    }
                }
                else if (word == "u")
                {
                    // If the argument cannot be parsed the control word is ignored,
                    // so the fallback text that follows it is emitted instead.
                    if (TryParseArgument(arg, out number))
                    {
                        if (number < 0)
                        {
                            number += 0x10000;
                        }

                        if (number >= 0 && number <= 0xFFFF)
                        {
                            // Appended as a raw UTF-16 code unit. Char.ConvertFromUtf32 rejects
                            // surrogate values, but two consecutive \u surrogate halves must be
                            // kept so that they combine into one supplementary character.
                            output.Append((char)number);
                        }
                        else if (number >= 0 && number <= 0x10FFFF)
                        {
                            output.Append(Char.ConvertFromUtf32(number));
                        }
                        else
                        {
                            // Not a representable code point.
                            output.Append("?");
                        }

                        curskip = ucskip;
                    }
                }
            }
            else if (!String.IsNullOrEmpty(hex)) // \'xx
            {
                if (curskip > 0)
                {
                    curskip -= 1;
                }
                else if (!ignorable)
                {
                    // The byte value is used directly as the code point (0x00-0xFF);
                    // no ANSI code page conversion is performed here.
                    int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
                    output.Append((char)c);
                }
            }
            else if (!String.IsNullOrEmpty(tchar))
            {
                if (curskip > 0)
                {
                    curskip -= 1;
                }
                else if (!ignorable)
                {
                    output.Append(tchar);
                }
            }
        }

        return output.ToString();
    }
}
编辑 1: 与此同时,我们已经在生产环境中运行并测试了这段代码,并对它做了一些改动。新版本增加了一些额外的安全检查,并且能更好地处理换行。
/// <summary>
/// Strip RTF Tags from RTF Text
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>Plain text from RTF, or null when <paramref name="inputRtf"/> is null</returns>
public static string StripRichTextFormat(string inputRtf)
{
    if (inputRtf == null)
    {
        return null;
    }

    var stack = new Stack<StackEntry>();
    bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
    int ucskip = 1;         // Number of ASCII characters to skip after a unicode character.
    int curskip = 0;        // Number of ASCII characters left to skip
    var output = new System.Text.StringBuilder(); // Output buffer.

    foreach (Match match in _rtfRegex.Matches(inputRtf))
    {
        string word = match.Groups[1].Value;
        string arg = match.Groups[2].Value;
        string hex = match.Groups[3].Value;
        string character = match.Groups[4].Value;
        string brace = match.Groups[5].Value;
        string tchar = match.Groups[6].Value;

        if (!String.IsNullOrEmpty(brace))
        {
            curskip = 0;
            if (brace == "{")
            {
                // Push state
                stack.Push(new StackEntry(ucskip, ignorable));
            }
            else if (stack.Count > 0)
            {
                // Pop state. An unmatched "}" is ignored instead of throwing
                // InvalidOperationException on an empty stack.
                StackEntry entry = stack.Pop();
                ucskip = entry.NumberOfCharactersToSkip;
                ignorable = entry.Ignorable;
            }
        }
        else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
        {
            curskip = 0;
            if (character == "~")
            {
                if (!ignorable)
                {
                    output.Append("\u00A0");
                }
            }
            else if ("{}\\".Contains(character))
            {
                // Escaped literal brace or backslash.
                if (!ignorable)
                {
                    output.Append(character);
                }
            }
            else if (character == "*")
            {
                ignorable = true;
            }
        }
        else if (!String.IsNullOrEmpty(word)) // \foo
        {
            curskip = 0;
            string special;
            int number;
            if (destinations.Contains(word))
            {
                ignorable = true;
            }
            else if (ignorable)
            {
                // Inside an ignorable group: control words produce nothing.
            }
            else if (specialCharacters.TryGetValue(word, out special))
            {
                output.Append(special);
            }
            else if (word == "uc")
            {
                // A missing or unparsable argument leaves the current skip count unchanged.
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign,
                        System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    ucskip = number;
                }
            }
            else if (word == "u")
            {
                // If the argument cannot be parsed the control word is ignored,
                // so the fallback text that follows it is emitted instead.
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign,
                        System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    if (number < 0)
                    {
                        number += 0x10000;
                    }

                    if (number >= 0 && number <= 0xFFFF)
                    {
                        // Appended as a raw UTF-16 code unit. Char.ConvertFromUtf32 rejects
                        // surrogate values (0xD800-0xDFFF), but two consecutive \u surrogate
                        // halves must be kept so they combine into one supplementary character.
                        output.Append((char)number);
                    }
                    else if (number >= 0 && number <= 0x10FFFF)
                    {
                        // A valid UTF-32 value lies between 0x000000 and 0x10FFFF (inclusive).
                        output.Append(Char.ConvertFromUtf32(number));
                    }
                    else
                    {
                        // Not a representable code point.
                        output.Append("?");
                    }

                    curskip = ucskip;
                }
            }
        }
        else if (!String.IsNullOrEmpty(hex)) // \'xx
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                // The byte value is used directly as the code point (0x00-0xFF);
                // no ANSI code page conversion is performed here.
                int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
                output.Append((char)c);
            }
        }
        else if (!String.IsNullOrEmpty(tchar))
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                output.Append(tchar);
            }
        }
    }

    return output.ToString();
}
【讨论】:
【参考方案2】:MSDN上有一篇简单的文章可以实现你想要的:http://msdn.microsoft.com/en-us/library/cc488002.aspx
/// <summary>
/// Sample program: reads test.rtf, converts it to plain text through a
/// Windows Forms RichTextBox, shows both versions and writes output.txt.
/// </summary>
class ConvertFromRTF
{
    static void Main()
    {
        string path = @"test.rtf";

        // Create the RichTextBox. (Requires a reference to System.Windows.Forms.dll.)
        // The control is disposable, so it is released as soon as the conversion is done.
        using (System.Windows.Forms.RichTextBox rtBox = new System.Windows.Forms.RichTextBox())
        {
            // Get the contents of the RTF file. Note that when it is
            // stored in the string, it is encoded as UTF-16.
            string s = System.IO.File.ReadAllText(path);

            // Display the RTF text.
            System.Windows.Forms.MessageBox.Show(s);

            // Convert the RTF to plain text.
            rtBox.Rtf = s;
            string plainText = rtBox.Text;

            // Display plain text output in MessageBox because console
            // cannot display Greek letters.
            System.Windows.Forms.MessageBox.Show(plainText);

            // Output plain text to file, encoded as UTF-8.
            System.IO.File.WriteAllText(@"output.txt", plainText);
        }
    }
}
【讨论】:
我想避免引用 System.Windows.Forms.dll。 这个方案使用了 RichTextBox,而 OP 说过他想避免使用它。不过我同意,最好的做法是不显示 RTB。【参考方案3】:我不赞成在此类任务中使用 RichTextBox 或任何其他控件。这是另一种方法:
/// <summary>
/// Converts RTF markup to plain text by loading it into a FlowDocument
/// through a TextRange and reading the range's text back.
/// </summary>
/// <param name="rtf">RTF markup; null is treated as an empty string.</param>
/// <returns>The text of the loaded range.</returns>
public string RtfToPlainText(string rtf)
{
    var flowDocument = new FlowDocument();
    var textRange = new TextRange(flowDocument.ContentStart, flowDocument.ContentEnd);

    // The RTF is handed to TextRange.Load as a UTF-8 byte stream; the stream
    // is only needed for the duration of the load.
    using (var stream = new MemoryStream(Encoding.UTF8.GetBytes(rtf ?? string.Empty)))
    {
        textRange.Load(stream, DataFormats.Rtf);
    }

    return textRange.Text;
}
【讨论】:
非常有趣,但是 afaik FlowDocument 是一个 WPF 控件,那么有什么优势呢?以上是关于如何在 C# 中将 rtf 字符串转换为文本的主要内容,如果未能解决你的问题,请参考以下文章