在 C# 中读取 .txt 文件时遇到问题:如何从紧接某个空行之后开始,一直读取到下一个空行为止
problem reading .txt file in C# starting from (after immediate empty whole line) to (the next empty whole line)
我正在尝试随机读取一个巨大的 .txt 文件。它包含大量段落,段落之间由一整行空白分隔(每个段落之后都有一个空行)。我希望每次随机读取时,都能取出一个完整的段落,以便保留上下文,不遗漏任何字符或单词。提前感谢您的帮助。
我添加了一个 for 循环只是为了测试,看看能否以某种方式识别连续出现的空白。显然,即使可行,这种做法也只对起点之后的内容有效,因为起点已经被选定了。
/// <summary>
/// Returns a single line of the given text file, chosen at random.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>The text of the randomly selected line.</returns>
public static string GetRandomLine(string filename)
{
    string[] fileLines = File.ReadAllLines(filename);

    // _rand is declared outside this snippet; presumably a shared System.Random,
    // in which case the upper bound is exclusive and every index can be drawn - confirm.
    int pickedIndex = _rand.Next(0, fileLines.Length);

    return fileLines[pickedIndex];
}
获得完整的段落
/// <summary>
/// Reads the whole file, cuts it into paragraphs and returns one of them at random.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>The text of the randomly selected paragraph.</returns>
public static string GetRandomParagraph(string fileName)
{
    // Two consecutive platform line breaks (i.e. one blank line) mark a paragraph boundary.
    string blankLine = Environment.NewLine + Environment.NewLine;

    // Load the text in one piece so it can be cut at paragraph boundaries instead of line by line.
    string contents = File.ReadAllText(fileName);

    // A line feed followed by three or more whitespace characters also counts as a boundary.
    string normalized = Regex.Replace(contents, @"(\n\s{3,})", blankLine);

    string[] pieces = normalized.Split(blankLine);

    // Random.Next excludes its upper bound, so the result is always a valid index into pieces.
    int chosen = new Random().Next(0, pieces.Length);
    return pieces[chosen];
}
// This builds a list of Paragraph first
/// <summary>
/// Reads a text file and groups its lines into paragraphs separated by blank lines.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>
/// The paragraphs in file order. The lines of a paragraph are joined with
/// <see cref="Environment.NewLine"/>. Blank (empty or whitespace-only) lines are
/// separators only and never produce an empty paragraph.
/// </returns>
public static List<string> GetParagraphs(string filename)
{
    var paragraphs = new List<string>();
    var currentLines = new List<string>();

    foreach (var line in File.ReadAllLines(filename))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            // A blank line closes the paragraph collected so far. Checking Count
            // keeps runs of consecutive blank lines from adding empty paragraphs.
            if (currentLines.Count > 0)
            {
                paragraphs.Add(string.Join(Environment.NewLine, currentLines));
                currentLines.Clear();
            }
        }
        else
        {
            // Still inside the same paragraph.
            currentLines.Add(line);
        }
    }

    // The file may not end with a blank line; flush the paragraph still being collected.
    if (currentLines.Count > 0)
    {
        paragraphs.Add(string.Join(Environment.NewLine, currentLines));
    }

    return paragraphs;
}
// Shared generator, reused across calls instead of creating a new Random each time.
public static Random rnd = new Random();

/// <summary>
/// Returns one paragraph of the given file, chosen at random.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>
/// A randomly selected element of the list returned by GetParagraphs,
/// or an empty string when that list is empty.
/// </returns>
public static string GetRandomParagraph(string fileName)
{
    var allParagraphs = GetParagraphs(fileName);
    if (allParagraphs.Count == 0)
    {
        return string.Empty;
    }

    // Random.Next excludes its upper bound, so passing Count (not Count - 1)
    // is what makes the last paragraph selectable.
    return allParagraphs[rnd.Next(0, allParagraphs.Count)];
}
请注意,如果您总是从同一个文件中读取数据,只需调用一次 GetParagraphs 并将段落列表保存在内存中,速度可能会快得多。
试试这个:
/// <summary>
/// Picks a random line of the file and returns the whole paragraph around it,
/// i.e. every line between the nearest empty line before it and the nearest
/// empty line after it.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>
/// The lines of the paragraph joined with <see cref="Environment.NewLine"/>.
/// If the picked line is itself empty, the paragraph that follows it is returned.
/// An empty file yields an empty string.
/// </returns>
public static string GetRandomLine(string filename)
{
    var lines = File.ReadAllLines(filename);
    if (lines.Length == 0)
    {
        return "";
    }

    // _rand is declared outside this snippet; presumably a shared System.Random - confirm.
    // The last line is never used as the starting point, as in the original answer.
    var lineNumber = _rand.Next(0, lines.Length - 1);

    // Walk back to the empty line before the paragraph, or to -1 when the
    // paragraph starts on the first line of the file.
    var blankBefore = lineNumber;
    while (blankBefore >= 0 && lines[blankBefore].Length > 0)
    {
        blankBefore--;
    }

    // Walk forward to the empty line after the paragraph, or to lines.Length
    // when the paragraph runs to the end of the file.
    var blankAfter = lineNumber + 1;
    while (blankAfter < lines.Length && lines[blankAfter].Length != 0)
    {
        blankAfter++;
    }

    // Everything strictly between the two boundaries belongs to the paragraph.
    // Joining with a line break keeps words on adjacent lines from running together.
    return string.Join(Environment.NewLine, lines, blankBefore + 1, blankAfter - blankBefore - 1);
}
根据您的描述,我假设文件以一个空行开始和结束。通过将随机行的独占上限设置为比行的长度小一,可以避免随机行成为文件最后一行的可能性。如果随机行是空行,blankBefore 将是该行的索引,否则,将回溯直到到达前一个空行。 blankAfter 作为随机行之后的下一行的索引开始,如果该行不是空白,则 blankAfter 增加直到它是下一个空白行的索引。
获得目标段落前后空行的索引后,只需将这两个空行之间的各行依次追加到 reply 中即可。
如果文件的第一行和最后一行不是空白,您需要验证 blankBefore 和 blankAfter 是否在数组的范围内。
尝试以下操作:
/// <summary>
/// Reads the file, splits it into paragraphs at blank lines and returns the
/// lines of one randomly chosen paragraph.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>The lines of the selected paragraph, without line-break characters.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static string[] GetRandomParagraph(string filePath)
{
    if (!File.Exists(filePath))
    {
        throw new FileNotFoundException("The file was not found", filePath);
    }

    string text = File.ReadAllText(filePath);

    // Accept both Windows (CRLF) and Unix (LF) line endings. The longer separator
    // is listed first so a CRLF blank line is matched as a whole.
    string[] paragraphs = text.Split(new string[] { "\r\n\r\n", "\n\n" }, StringSplitOptions.None);

    // Random.Next excludes its upper bound, so the index is always valid.
    string chosen = paragraphs[new Random().Next(0, paragraphs.Length)];

    // Splitting on "\r\n" as well as "\n" keeps a trailing '\r' off each line.
    return chosen.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
}
我真的希望这就是您要找的。
我对上面 @TheCoderCrab 提供的代码做了一些修改。我把该方法改成了返回 string 的方法,因此它会返回一个字符串。我只是添加了一个 for 循环,把 paragraph 数组中的所有元素依次追加到一个新字符串中,然后返回该字符串。谢谢。
/// <summary>
/// Reads the file, splits it into paragraphs at blank lines and returns one
/// randomly chosen paragraph as a single string.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>
/// The lines of the selected paragraph concatenated directly, with the line
/// breaks between them removed.
/// </returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static string GetRandomParagraph(string filePath)
{
    if (!File.Exists(filePath))
    {
        throw new FileNotFoundException("The file was not found", filePath);
    }

    string text = File.ReadAllText(filePath);

    // Accept both Windows (CRLF) and Unix (LF) line endings. The longer separator
    // is listed first so a CRLF blank line is matched as a whole.
    string[] paragraphs = text.Split(new string[] { "\r\n\r\n", "\n\n" }, StringSplitOptions.None);

    // Random.Next excludes its upper bound, so the index is always valid.
    string chosen = paragraphs[new Random().Next(0, paragraphs.Length)];

    // Split on either line ending so no stray '\r' is left in the result,
    // then glue the lines back together without separators.
    string[] paragraph = chosen.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
    return string.Concat(paragraph);
}
我正在尝试随机读取一个巨大的 .txt 文件。它包含大量段落,段落之间由一整行空白分隔(每个段落之后都有一个空行)。我希望每次随机读取时,都能取出一个完整的段落,以便保留上下文,不遗漏任何字符或单词。提前感谢您的帮助。
我添加了一个 for 循环只是为了测试,看看能否以某种方式识别连续出现的空白。显然,即使可行,这种做法也只对起点之后的内容有效,因为起点已经被选定了。
/// <summary>
/// Returns a single line of the given text file, chosen at random.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>The text of the randomly selected line.</returns>
public static string GetRandomLine(string filename)
{
    string[] fileLines = File.ReadAllLines(filename);

    // _rand is declared outside this snippet; presumably a shared System.Random,
    // in which case the upper bound is exclusive and every index can be drawn - confirm.
    int pickedIndex = _rand.Next(0, fileLines.Length);

    return fileLines[pickedIndex];
}
获得完整的段落
/// <summary>
/// Reads the whole file, cuts it into paragraphs and returns one of them at random.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>The text of the randomly selected paragraph.</returns>
public static string GetRandomParagraph(string fileName)
{
    // Two consecutive platform line breaks (i.e. one blank line) mark a paragraph boundary.
    string blankLine = Environment.NewLine + Environment.NewLine;

    // Load the text in one piece so it can be cut at paragraph boundaries instead of line by line.
    string contents = File.ReadAllText(fileName);

    // A line feed followed by three or more whitespace characters also counts as a boundary.
    string normalized = Regex.Replace(contents, @"(\n\s{3,})", blankLine);

    string[] pieces = normalized.Split(blankLine);

    // Random.Next excludes its upper bound, so the result is always a valid index into pieces.
    int chosen = new Random().Next(0, pieces.Length);
    return pieces[chosen];
}
// This builds a list of Paragraph first
/// <summary>
/// Reads a text file and groups its lines into paragraphs separated by blank lines.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>
/// The paragraphs in file order. The lines of a paragraph are joined with
/// <see cref="Environment.NewLine"/>. Blank (empty or whitespace-only) lines are
/// separators only and never produce an empty paragraph.
/// </returns>
public static List<string> GetParagraphs(string filename)
{
    var paragraphs = new List<string>();
    var currentLines = new List<string>();

    foreach (var line in File.ReadAllLines(filename))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            // A blank line closes the paragraph collected so far. Checking Count
            // keeps runs of consecutive blank lines from adding empty paragraphs.
            if (currentLines.Count > 0)
            {
                paragraphs.Add(string.Join(Environment.NewLine, currentLines));
                currentLines.Clear();
            }
        }
        else
        {
            // Still inside the same paragraph.
            currentLines.Add(line);
        }
    }

    // The file may not end with a blank line; flush the paragraph still being collected.
    if (currentLines.Count > 0)
    {
        paragraphs.Add(string.Join(Environment.NewLine, currentLines));
    }

    return paragraphs;
}
// Shared generator, reused across calls instead of creating a new Random each time.
public static Random rnd = new Random();

/// <summary>
/// Returns one paragraph of the given file, chosen at random.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>
/// A randomly selected element of the list returned by GetParagraphs,
/// or an empty string when that list is empty.
/// </returns>
public static string GetRandomParagraph(string fileName)
{
    var allParagraphs = GetParagraphs(fileName);
    if (allParagraphs.Count == 0)
    {
        return string.Empty;
    }

    // Random.Next excludes its upper bound, so passing Count (not Count - 1)
    // is what makes the last paragraph selectable.
    return allParagraphs[rnd.Next(0, allParagraphs.Count)];
}
请注意,如果您总是从同一个文件中读取数据,只需调用一次 GetParagraphs 并将段落列表保存在内存中,速度可能会快得多。
试试这个:
/// <summary>
/// Picks a random line of the file and returns the whole paragraph around it,
/// i.e. every line between the nearest empty line before it and the nearest
/// empty line after it.
/// </summary>
/// <param name="filename">Path of the text file to read.</param>
/// <returns>
/// The lines of the paragraph joined with <see cref="Environment.NewLine"/>.
/// If the picked line is itself empty, the paragraph that follows it is returned.
/// An empty file yields an empty string.
/// </returns>
public static string GetRandomLine(string filename)
{
    var lines = File.ReadAllLines(filename);
    if (lines.Length == 0)
    {
        return "";
    }

    // _rand is declared outside this snippet; presumably a shared System.Random - confirm.
    // The last line is never used as the starting point, as in the original answer.
    var lineNumber = _rand.Next(0, lines.Length - 1);

    // Walk back to the empty line before the paragraph, or to -1 when the
    // paragraph starts on the first line of the file.
    var blankBefore = lineNumber;
    while (blankBefore >= 0 && lines[blankBefore].Length > 0)
    {
        blankBefore--;
    }

    // Walk forward to the empty line after the paragraph, or to lines.Length
    // when the paragraph runs to the end of the file.
    var blankAfter = lineNumber + 1;
    while (blankAfter < lines.Length && lines[blankAfter].Length != 0)
    {
        blankAfter++;
    }

    // Everything strictly between the two boundaries belongs to the paragraph.
    // Joining with a line break keeps words on adjacent lines from running together.
    return string.Join(Environment.NewLine, lines, blankBefore + 1, blankAfter - blankBefore - 1);
}
根据您的描述,我假设文件以一个空行开始和结束。通过将随机行的独占上限设置为比行的长度小一,可以避免随机行成为文件最后一行的可能性。如果随机行是空行,blankBefore 将是该行的索引,否则,将回溯直到到达前一个空行。 blankAfter 作为随机行之后的下一行的索引开始,如果该行不是空白,则 blankAfter 增加直到它是下一个空白行的索引。
获得目标段落前后空行的索引后,只需将这两个空行之间的各行依次追加到 reply 中即可。
如果文件的第一行和最后一行不是空白,您需要验证 blankBefore 和 blankAfter 是否在数组的范围内。
尝试以下操作:
/// <summary>
/// Reads the file, splits it into paragraphs at blank lines and returns the
/// lines of one randomly chosen paragraph.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>The lines of the selected paragraph, without line-break characters.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static string[] GetRandomParagraph(string filePath)
{
    if (!File.Exists(filePath))
    {
        throw new FileNotFoundException("The file was not found", filePath);
    }

    string text = File.ReadAllText(filePath);

    // Accept both Windows (CRLF) and Unix (LF) line endings. The longer separator
    // is listed first so a CRLF blank line is matched as a whole.
    string[] paragraphs = text.Split(new string[] { "\r\n\r\n", "\n\n" }, StringSplitOptions.None);

    // Random.Next excludes its upper bound, so the index is always valid.
    string chosen = paragraphs[new Random().Next(0, paragraphs.Length)];

    // Splitting on "\r\n" as well as "\n" keeps a trailing '\r' off each line.
    return chosen.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
}
我真的希望这就是您要找的。
我对上面 @TheCoderCrab 提供的代码做了一些修改。我把该方法改成了返回 string 的方法,因此它会返回一个字符串。我只是添加了一个 for 循环,把 paragraph 数组中的所有元素依次追加到一个新字符串中,然后返回该字符串。谢谢。
/// <summary>
/// Reads the file, splits it into paragraphs at blank lines and returns one
/// randomly chosen paragraph as a single string.
/// </summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>
/// The lines of the selected paragraph concatenated directly, with the line
/// breaks between them removed.
/// </returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static string GetRandomParagraph(string filePath)
{
    if (!File.Exists(filePath))
    {
        throw new FileNotFoundException("The file was not found", filePath);
    }

    string text = File.ReadAllText(filePath);

    // Accept both Windows (CRLF) and Unix (LF) line endings. The longer separator
    // is listed first so a CRLF blank line is matched as a whole.
    string[] paragraphs = text.Split(new string[] { "\r\n\r\n", "\n\n" }, StringSplitOptions.None);

    // Random.Next excludes its upper bound, so the index is always valid.
    string chosen = paragraphs[new Random().Next(0, paragraphs.Length)];

    // Split on either line ending so no stray '\r' is left in the result,
    // then glue the lines back together without separators.
    string[] paragraph = chosen.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
    return string.Concat(paragraph);
}