提取 DOCX 注释
Extract DOCX Comments
我是一名教师。我想要一份对我布置的论文发表评论的所有学生的名单,以及他们所说的内容。 Drive API 的东西对我来说太具有挑战性了,但我想我可以将它们下载为 zip 文件并解析 XML.
评论被标记在 w:comment 标签中,评论文本则位于 w:t 标签中。这应该很容易,但是 XML (etree) 却把我难住了。
通过教程(和官方 Python 文档):
# A .docx file is a ZIP archive; read the raw bytes of its comments part.
# The context manager closes the archive as soon as the bytes are read.
with zipfile.ZipFile('test.docx') as z:
    x = z.read('word/comments.xml')
# Parse the XML bytes; `tree` is the root element of the comments part.
tree = etree.XML(x)
然后我这样做:
# Walk the root element and all of its descendants.
# iter() is the supported replacement for the deprecated getiterator().
children = tree.iter()
for c in children:
    # attrib maps each (namespace-qualified) attribute name to its value.
    print(c.attrib)
结果是:
{}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}author': 'Joe Shmoe', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id': '1', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}date': '2017-11-17T16:58:27Z'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidR': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidDel': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidP': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidRDefault': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidRPr': '00000000'}
{}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
在这之后我完全被困住了。我试过 element.get() 和 element.findall(),都没有成功。即使我直接复制粘贴该值('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'),返回的也是 None。
有人能帮忙吗?
考虑到 OOXML 是一种如此复杂的格式,您已经取得了相当大的进展。
下面是一些示例 Python 代码,展示了如何通过 XPath 访问 DOCX 文件的注释:
from lxml import etree
import zipfile
# Prefix-to-URI map so the XPath expressions below can use the short "w:" prefix.
ooXMLns = {'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


def get_comments(docxFileName):
    """Print the author, date and text of every comment in a DOCX file.

    Args:
        docxFileName: Path (or file-like object) of the .docx archive,
            handed straight to ``zipfile.ZipFile``.

    Nothing is printed when the archive has no ``word/comments.xml`` part.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(docxFileName) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            # ZipFile.read raises KeyError for a missing member: no comments.
            return
    et = etree.XML(commentsXML)
    comments = et.xpath('//w:comment',namespaces=ooXMLns)
    for c in comments:
        # attributes (an attribute XPath returns a list, printed as such):
        print(c.xpath('@w:author',namespaces=ooXMLns))
        print(c.xpath('@w:date',namespaces=ooXMLns))
        # string value of the comment:
        print(c.xpath('string(.)',namespaces=ooXMLns))
我使用 Word 对象模型(Word Object Model)从 Word 文档中提取评论及其回复。Comments 对象的文档可以在这里找到。该文档使用的是 Visual Basic for Applications (VBA),但我只需稍作修改就能在 Python 中使用这些函数。Word 对象模型的唯一问题是我必须使用 pywin32 中的 win32com 包,它在 Windows PC 上运行良好,但我不确定它是否可以在 macOS 上运行。
这是我用来提取评论和相关回复的示例代码:
import win32com.client as win32
from win32com.client import constants
# Obtain the Word application COM object through win32com's gencache dispatch.
word = win32.gencache.EnsureDispatch('Word.Application')
# Keep the Word window hidden while the script runs.
word.Visible = False
# Raw string: in a normal literal "\t" and "\f" are read as tab and form-feed
# escapes, which would silently corrupt this Windows path.
filepath = r"path\to\file.docx"
def get_comments(filepath):
    """Print every top-level comment of a Word document with its replies.

    Args:
        filepath: Path of the document, passed to ``word.Documents.Open``.

    The document is closed again even when reading a comment raises.
    """
    doc = word.Documents.Open(filepath)
    try:
        doc.Activate()
        activeDoc = word.ActiveDocument
        for c in activeDoc.Comments:
            if c.Ancestor is None: #checking if this is a top-level comment
                print("Comment by: " + c.Author)
                print("Comment text: " + c.Range.Text) #text of the comment
                print("Regarding: " + c.Scope.Text) #text of the original document where the comment is anchored
                reply_count = len(c.Replies)
                if reply_count > 0: #if the comment has replies
                    print("Number of replies: " + str(reply_count))
                    # Replies are addressed by index starting at 1.
                    for r in range(1, reply_count+1):
                        print("Reply by: " + c.Replies(r).Author)
                        print("Reply text: " + c.Replies(r).Range.Text) #text of the reply
    finally:
        doc.Close()
感谢@kjhughes 从文档文件中提取所有评论的惊人答案。我和这个线程中的其他人一样面临同样的问题,以获取评论相关的文本。我以@kjhughes 的代码为基础,并尝试使用 python-docx 来解决这个问题。所以这是我对此的看法。
示例文档。
我将提取文档中引用的注释和段落。
from docx import Document
from lxml import etree
import zipfile
# Prefix-to-URI map so the XPath expressions below can use the short "w:" prefix.
ooXMLns = {'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


def get_document_comments(docxFileName):
    """Collect the text of every comment stored in a DOCX file.

    Args:
        docxFileName: Path (or file-like object) of the .docx archive,
            handed straight to ``zipfile.ZipFile``.

    Returns:
        A dict mapping each comment's ``w:id`` attribute value to the string
        value of that comment element. The dict is empty when the archive
        has no ``word/comments.xml`` part.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(docxFileName) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            # ZipFile.read raises KeyError for a missing member: no comments.
            return {}
    et = etree.XML(commentsXML)
    return {
        c.xpath('@w:id',namespaces=ooXMLns)[0]: c.xpath('string(.)',namespaces=ooXMLns)
        for c in et.xpath('//w:comment',namespaces=ooXMLns)
    }
def paragraph_comments(paragraph,comments_dict):
    """Return the text of every comment referenced from a paragraph.

    Args:
        paragraph: Object exposing ``runs``; each run's ``_r`` element is
            searched for ``w:commentReference`` children.
        comments_dict: Mapping of comment id to comment text.

    Returns:
        A list of comment texts in the order their references are found.

    Raises:
        KeyError: If a referenced id is not a key of ``comments_dict``.
    """
    comments=[]
    for run in paragraph.runs:
        # Collect every reference in the run, not just the first one.
        for reference in run._r.xpath("./w:commentReference"):
            comment_id=reference.xpath('@w:id',namespaces=ooXMLns)[0]
            comments.append(comments_dict[comment_id])
    return comments
def comments_with_reference_paragraph(docxFileName):
    """Pair each commented paragraph of a DOCX file with its comments.

    Args:
        docxFileName: Path of the .docx file; it is opened once with
            ``Document`` and once by ``get_document_comments``.

    Returns:
        A list of single-entry dicts of the form
        ``[{'Paragraph text': [comment 1, comment 2]}]``, one per paragraph
        that references at least one comment. Paragraphs without comments
        are omitted.
    """
    document = Document(docxFileName)
    comments_dict=get_document_comments(docxFileName)
    comments_with_their_reference_paragraph=[]
    # The dict never changes, so test it once instead of on every paragraph.
    if not comments_dict:
        return comments_with_their_reference_paragraph
    for paragraph in document.paragraphs:
        comments=paragraph_comments(paragraph,comments_dict)
        if comments:
            comments_with_their_reference_paragraph.append({paragraph.text: comments})
    return comments_with_their_reference_paragraph
# Script entry point: print the paragraph-to-comments list for one document.
if __name__=="__main__":
    document="test.docx" #filepath for the input document
    print(comments_with_reference_paragraph(document))
示例文档的输出如下所示
我是在段落级别完成的。这也可以在 python-docx 的 run(文本块)级别完成。
希望对你有帮助。
我是一名教师。我想要一份对我布置的论文发表评论的所有学生的名单,以及他们所说的内容。 Drive API 的东西对我来说太具有挑战性了,但我想我可以将它们下载为 zip 文件并解析 XML.
评论被标记在 w:comment 标签中,评论文本则位于 w:t 标签中。这应该很容易,但是 XML (etree) 却把我难住了。
通过教程(和官方 Python 文档):
# A .docx file is a ZIP archive; read the raw bytes of its comments part.
# The context manager closes the archive as soon as the bytes are read.
with zipfile.ZipFile('test.docx') as z:
    x = z.read('word/comments.xml')
# Parse the XML bytes; `tree` is the root element of the comments part.
tree = etree.XML(x)
然后我这样做:
# Walk the root element and all of its descendants.
# iter() is the supported replacement for the deprecated getiterator().
children = tree.iter()
for c in children:
    # attrib maps each (namespace-qualified) attribute name to its value.
    print(c.attrib)
结果是:
{}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}author': 'Joe Shmoe', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id': '1', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}date': '2017-11-17T16:58:27Z'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidR': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidDel': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidP': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidRDefault': '00000000', '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsidRPr': '00000000'}
{}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
{'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val': '0'}
在这之后我完全被困住了。我试过 element.get() 和 element.findall(),都没有成功。即使我直接复制粘贴该值('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'),返回的也是 None。
有人能帮忙吗?
考虑到 OOXML 是一种如此复杂的格式,您已经取得了相当大的进展。
下面是一些示例 Python 代码,展示了如何通过 XPath 访问 DOCX 文件的注释:
from lxml import etree
import zipfile
# Prefix-to-URI map so the XPath expressions below can use the short "w:" prefix.
ooXMLns = {'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


def get_comments(docxFileName):
    """Print the author, date and text of every comment in a DOCX file.

    Args:
        docxFileName: Path (or file-like object) of the .docx archive,
            handed straight to ``zipfile.ZipFile``.

    Nothing is printed when the archive has no ``word/comments.xml`` part.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(docxFileName) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            # ZipFile.read raises KeyError for a missing member: no comments.
            return
    et = etree.XML(commentsXML)
    comments = et.xpath('//w:comment',namespaces=ooXMLns)
    for c in comments:
        # attributes (an attribute XPath returns a list, printed as such):
        print(c.xpath('@w:author',namespaces=ooXMLns))
        print(c.xpath('@w:date',namespaces=ooXMLns))
        # string value of the comment:
        print(c.xpath('string(.)',namespaces=ooXMLns))
我使用 Word 对象模型(Word Object Model)从 Word 文档中提取评论及其回复。Comments 对象的文档可以在这里找到。该文档使用的是 Visual Basic for Applications (VBA),但我只需稍作修改就能在 Python 中使用这些函数。Word 对象模型的唯一问题是我必须使用 pywin32 中的 win32com 包,它在 Windows PC 上运行良好,但我不确定它是否可以在 macOS 上运行。
这是我用来提取评论和相关回复的示例代码:
import win32com.client as win32
from win32com.client import constants
# Obtain the Word application COM object through win32com's gencache dispatch.
word = win32.gencache.EnsureDispatch('Word.Application')
# Keep the Word window hidden while the script runs.
word.Visible = False
# Raw string: in a normal literal "\t" and "\f" are read as tab and form-feed
# escapes, which would silently corrupt this Windows path.
filepath = r"path\to\file.docx"
def get_comments(filepath):
    """Print every top-level comment of a Word document with its replies.

    Args:
        filepath: Path of the document, passed to ``word.Documents.Open``.

    The document is closed again even when reading a comment raises.
    """
    doc = word.Documents.Open(filepath)
    try:
        doc.Activate()
        activeDoc = word.ActiveDocument
        for c in activeDoc.Comments:
            if c.Ancestor is None: #checking if this is a top-level comment
                print("Comment by: " + c.Author)
                print("Comment text: " + c.Range.Text) #text of the comment
                print("Regarding: " + c.Scope.Text) #text of the original document where the comment is anchored
                reply_count = len(c.Replies)
                if reply_count > 0: #if the comment has replies
                    print("Number of replies: " + str(reply_count))
                    # Replies are addressed by index starting at 1.
                    for r in range(1, reply_count+1):
                        print("Reply by: " + c.Replies(r).Author)
                        print("Reply text: " + c.Replies(r).Range.Text) #text of the reply
    finally:
        doc.Close()
感谢@kjhughes 从文档文件中提取所有评论的惊人答案。我和这个线程中的其他人一样面临同样的问题,以获取评论相关的文本。我以@kjhughes 的代码为基础,并尝试使用 python-docx 来解决这个问题。所以这是我对此的看法。
示例文档。
我将提取文档中引用的注释和段落。
from docx import Document
from lxml import etree
import zipfile
# Prefix-to-URI map so the XPath expressions below can use the short "w:" prefix.
ooXMLns = {'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


def get_document_comments(docxFileName):
    """Collect the text of every comment stored in a DOCX file.

    Args:
        docxFileName: Path (or file-like object) of the .docx archive,
            handed straight to ``zipfile.ZipFile``.

    Returns:
        A dict mapping each comment's ``w:id`` attribute value to the string
        value of that comment element. The dict is empty when the archive
        has no ``word/comments.xml`` part.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(docxFileName) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            # ZipFile.read raises KeyError for a missing member: no comments.
            return {}
    et = etree.XML(commentsXML)
    return {
        c.xpath('@w:id',namespaces=ooXMLns)[0]: c.xpath('string(.)',namespaces=ooXMLns)
        for c in et.xpath('//w:comment',namespaces=ooXMLns)
    }
def paragraph_comments(paragraph,comments_dict):
    """Return the text of every comment referenced from a paragraph.

    Args:
        paragraph: Object exposing ``runs``; each run's ``_r`` element is
            searched for ``w:commentReference`` children.
        comments_dict: Mapping of comment id to comment text.

    Returns:
        A list of comment texts in the order their references are found.

    Raises:
        KeyError: If a referenced id is not a key of ``comments_dict``.
    """
    comments=[]
    for run in paragraph.runs:
        # Collect every reference in the run, not just the first one.
        for reference in run._r.xpath("./w:commentReference"):
            comment_id=reference.xpath('@w:id',namespaces=ooXMLns)[0]
            comments.append(comments_dict[comment_id])
    return comments
def comments_with_reference_paragraph(docxFileName):
    """Pair each commented paragraph of a DOCX file with its comments.

    Args:
        docxFileName: Path of the .docx file; it is opened once with
            ``Document`` and once by ``get_document_comments``.

    Returns:
        A list of single-entry dicts of the form
        ``[{'Paragraph text': [comment 1, comment 2]}]``, one per paragraph
        that references at least one comment. Paragraphs without comments
        are omitted.
    """
    document = Document(docxFileName)
    comments_dict=get_document_comments(docxFileName)
    comments_with_their_reference_paragraph=[]
    # The dict never changes, so test it once instead of on every paragraph.
    if not comments_dict:
        return comments_with_their_reference_paragraph
    for paragraph in document.paragraphs:
        comments=paragraph_comments(paragraph,comments_dict)
        if comments:
            comments_with_their_reference_paragraph.append({paragraph.text: comments})
    return comments_with_their_reference_paragraph
# Script entry point: print the paragraph-to-comments list for one document.
if __name__=="__main__":
    document="test.docx" #filepath for the input document
    print(comments_with_reference_paragraph(document))
示例文档的输出如下所示
我是在段落级别完成的。这也可以在 python-docx 的 run(文本块)级别完成。希望对你有帮助。