使用 iTextSharp 5.5.8 将 html 转换为 PDF 时不显示希伯来语内容?
Hebrew content not displayed when converting html to PDF using iTextSharp 5.5.8?
我正在使用以下代码,通过 iTextSharp 将 Html 文件转换为 Pdf:
// Converts the HTML held in htmlString into an A4-sized PDF written to savePath.
// The output stream, the document and the writer are wrapped in using blocks so
// the file handle is released even when parsing throws.
using (var output = new FileStream(savePath, FileMode.Create))
using (var doc = new Document(iTextSharp.text.PageSize.A4, 10, 20, 5, 35))
using (var writer = PdfWriter.GetInstance(doc, output))
{
    // Fonts are supplied by the custom MyFontProvider (defined elsewhere).
    var cssAppliers = new CssAppliersImpl(new MyFontProvider());
    CssFilesImpl cssFiles = new CssFilesImpl();
    StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);

    HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());
    htmlContext.SetImageProvider(new ITextImageHandler());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    IPipeline pipeline = new CssResolverPipeline(cssResolver, new HtmlPipeline(htmlContext, new PdfWriterPipeline(doc, writer)));
    XMLWorker worker = new XMLWorker(pipeline, true);
    XMLParser xmlParser = new XMLParser(true, worker, Encoding.Unicode);

    doc.Open();
    doc.NewPage();
    xmlParser.Parse(new StringReader(htmlString.ToString()));
    doc.Close();
}
对于英文内容,这工作正常。但如果内容是希伯来语,则 PDF 中不会显示文本。
我已经在 Stack Overflow 上查看了与此相关的其他答案,但它们似乎都使用了已弃用的 HtmlParser,所以我不想用它。
如果还有其他需要,请告诉我。谢谢你的时间。
编辑:阅读评论后,我也尝试了设置字体,但仍然没有成功。下面是更新后的代码。
// Converts the HTML held in htmlString into a PDF written to savePath, with the
// Noto Sans Hebrew font registered explicitly. The output stream, the document
// and the writer are wrapped in using blocks so the file handle is released even
// when parsing throws.
using (var output = new FileStream(savePath, FileMode.Create))
using (Document document = new Document())
using (PdfWriter writer = PdfWriter.GetInstance(document, output))
{
    document.Open();

    var cssResolver = new StyleAttrCSSResolver();

    // NOTE(review): DONTLOOKFORFONTS presumably skips scanning the system font
    // directories, leaving only the font registered below - confirm.
    XMLWorkerFontProvider fontProvider =
        new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
    fontProvider.Register(@"E:\fonts\NotoSansHebrew-Regular.ttf");
    CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);

    HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());
    htmlContext.SetImageProvider(new ITextImageHandler());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
    HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
    CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);

    XMLWorker worker = new XMLWorker(css, true);
    XMLParser p = new XMLParser(worker);
    p.Parse(new StringReader(htmlString.ToString()));

    document.Close();
}
下面是对 Bruno 的代码的改编,并加入了一些实际的 HTML。要运行它,您只需要下载字体 Noto Sans Hebrew 并将其放在您的桌面上。在不进行任何修改(可能文件路径除外)的情况下,尝试运行这段对我有用的代码。(我针对 5.5.5 对此进行了测试,因此 5.5.8 应该绝对有效。)
// Input and output paths both live on the current user's desktop.
var desktop = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
var file = System.IO.Path.Combine(desktop, "test.pdf");
var fontFile = System.IO.Path.Combine(desktop, "NotoSansHebrew-Regular.ttf");

// Right-to-left Hebrew sample whose font-family names the registered font.
var htmlText = @"<div dir=""rtl"" style=""font-family: Noto Sans Hebrew;"">שלום עולם</div>";

using (var FS = new System.IO.FileStream(file, FileMode.Create, FileAccess.Write, FileShare.None))
using (var document = new Document())
using (var writer = PdfWriter.GetInstance(document, FS))
{
    document.Open();

    // Font provider that knows only about the font file registered here.
    var fontProvider = new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
    fontProvider.Register(fontFile);

    var htmlContext = new HtmlPipelineContext(new CssAppliersImpl(fontProvider));
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    var pipeline = new CssResolverPipeline(
        new StyleAttrCSSResolver(),
        new HtmlPipeline(htmlContext, new PdfWriterPipeline(document, writer)));
    var parser = new XMLParser(new XMLWorker(pipeline, true));

    // Feed the markup to the parser as a UTF-8 encoded stream.
    var htmlBytes = System.Text.Encoding.UTF8.GetBytes(htmlText);
    using (var ms = new System.IO.MemoryStream(htmlBytes))
    using (var sr = new StreamReader(ms))
    {
        parser.Parse(sr);
    }

    document.Close();
}
整个事情的诀窍是在您的 HTML 中获取字体文件中的准确字体名称。有时令人困惑的是,字体实际上可以包含一堆名称。字体越旧,它就越有可能拥有这些。如果我没记错的话,iText 有一些用于确定字体名称的启发式方法,但如果您想安全地使用它,您也可以使用别名并随意调用它。例如,您可以将 HTML 更改为:
// Same markup, but font-family now names the alias "Gerp" rather than the font's own name.
string htmlText = @"<div dir=""rtl"" style=""font-family: Gerp;"">שלום עולם</div>";
只要您在注册字体时使用别名,一切都会正常进行:
// Register the font file under the alias "Gerp" so the HTML can refer to it by that name.
fontProvider.Register(fontFile, "Gerp");
我正在使用以下代码,通过 iTextSharp
将 Html 文件转换为 Pdf
// Converts the HTML held in htmlString into an A4-sized PDF written to savePath.
// The output stream, the document and the writer are wrapped in using blocks so
// the file handle is released even when parsing throws.
using (var output = new FileStream(savePath, FileMode.Create))
using (var doc = new Document(iTextSharp.text.PageSize.A4, 10, 20, 5, 35))
using (var writer = PdfWriter.GetInstance(doc, output))
{
    // Fonts are supplied by the custom MyFontProvider (defined elsewhere).
    var cssAppliers = new CssAppliersImpl(new MyFontProvider());
    CssFilesImpl cssFiles = new CssFilesImpl();
    StyleAttrCSSResolver cssResolver = new StyleAttrCSSResolver(cssFiles);

    HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());
    htmlContext.SetImageProvider(new ITextImageHandler());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    IPipeline pipeline = new CssResolverPipeline(cssResolver, new HtmlPipeline(htmlContext, new PdfWriterPipeline(doc, writer)));
    XMLWorker worker = new XMLWorker(pipeline, true);
    XMLParser xmlParser = new XMLParser(true, worker, Encoding.Unicode);

    doc.Open();
    doc.NewPage();
    xmlParser.Parse(new StringReader(htmlString.ToString()));
    doc.Close();
}
对于英文内容,这工作正常。但如果内容是希伯来语,则 PDF 中不会显示文本。
我已经在 Stack Overflow 上查看了与此相关的其他答案,但它们似乎都使用了已弃用的 HtmlParser,所以我不想用它。
如果还有其他需要,请告诉我。谢谢你的时间。
编辑:阅读评论后,我也尝试了设置字体,但仍然没有成功。下面是更新后的代码。
// Converts the HTML held in htmlString into a PDF written to savePath, with the
// Noto Sans Hebrew font registered explicitly. The output stream, the document
// and the writer are wrapped in using blocks so the file handle is released even
// when parsing throws.
using (var output = new FileStream(savePath, FileMode.Create))
using (Document document = new Document())
using (PdfWriter writer = PdfWriter.GetInstance(document, output))
{
    document.Open();

    var cssResolver = new StyleAttrCSSResolver();

    // NOTE(review): DONTLOOKFORFONTS presumably skips scanning the system font
    // directories, leaving only the font registered below - confirm.
    XMLWorkerFontProvider fontProvider =
        new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
    fontProvider.Register(@"E:\fonts\NotoSansHebrew-Regular.ttf");
    CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);

    HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());
    htmlContext.SetImageProvider(new ITextImageHandler());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    PdfWriterPipeline pdf = new PdfWriterPipeline(document, writer);
    HtmlPipeline html = new HtmlPipeline(htmlContext, pdf);
    CssResolverPipeline css = new CssResolverPipeline(cssResolver, html);

    XMLWorker worker = new XMLWorker(css, true);
    XMLParser p = new XMLParser(worker);
    p.Parse(new StringReader(htmlString.ToString()));

    document.Close();
}
下面是对 Bruno 的代码的改编,并加入了一些实际的 HTML。要运行它,您只需要下载字体 Noto Sans Hebrew 并将其放在您的桌面上。在不进行任何修改(可能文件路径除外)的情况下,尝试运行这段对我有用的代码。(我针对 5.5.5 对此进行了测试,因此 5.5.8 应该绝对有效。)
// Input and output paths both live on the current user's desktop.
var desktop = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
var file = System.IO.Path.Combine(desktop, "test.pdf");
var fontFile = System.IO.Path.Combine(desktop, "NotoSansHebrew-Regular.ttf");

// Right-to-left Hebrew sample whose font-family names the registered font.
var htmlText = @"<div dir=""rtl"" style=""font-family: Noto Sans Hebrew;"">שלום עולם</div>";

using (var FS = new System.IO.FileStream(file, FileMode.Create, FileAccess.Write, FileShare.None))
using (var document = new Document())
using (var writer = PdfWriter.GetInstance(document, FS))
{
    document.Open();

    // Font provider that knows only about the font file registered here.
    var fontProvider = new XMLWorkerFontProvider(XMLWorkerFontProvider.DONTLOOKFORFONTS);
    fontProvider.Register(fontFile);

    var htmlContext = new HtmlPipelineContext(new CssAppliersImpl(fontProvider));
    htmlContext.SetTagFactory(Tags.GetHtmlTagProcessorFactory());

    // Pipeline chain: CSS resolution -> HTML processing -> PDF output.
    var pipeline = new CssResolverPipeline(
        new StyleAttrCSSResolver(),
        new HtmlPipeline(htmlContext, new PdfWriterPipeline(document, writer)));
    var parser = new XMLParser(new XMLWorker(pipeline, true));

    // Feed the markup to the parser as a UTF-8 encoded stream.
    var htmlBytes = System.Text.Encoding.UTF8.GetBytes(htmlText);
    using (var ms = new System.IO.MemoryStream(htmlBytes))
    using (var sr = new StreamReader(ms))
    {
        parser.Parse(sr);
    }

    document.Close();
}
整个事情的诀窍是在您的 HTML 中获取字体文件中的准确字体名称。有时令人困惑的是,字体实际上可以包含一堆名称。字体越旧,它就越有可能拥有这些。如果我没记错的话,iText 有一些用于确定字体名称的启发式方法,但如果您想安全地使用它,您也可以使用别名并随意调用它。例如,您可以将 HTML 更改为:
// Same markup, but font-family now names the alias "Gerp" rather than the font's own name.
string htmlText = @"<div dir=""rtl"" style=""font-family: Gerp;"">שלום עולם</div>";
只要您在注册字体时使用别名,一切都会正常进行:
// Register the font file under the alias "Gerp" so the HTML can refer to it by that name.
fontProvider.Register(fontFile, "Gerp");