使用 Python 将邮箱(mbox)转换为 CSV

MailBox to csv using Python

我已经从我的 gmail 帐户下载了邮件存档。我正在使用从博客中获取的以下 python(2.7) 代码将存档的内容转换为 csv。

import mailbox
import csv
# Export subject, sender and date of every message in the archive to a CSV.
# Binary mode ("wb") is what the Python 2 csv module expects; the `with`
# block guarantees the file is flushed and closed when the loop finishes.
with open("clean_mail.csv", "wb") as csv_file:
    writer = csv.writer(csv_file)
    for message in mailbox.mbox('archive.mbox'):
        writer.writerow([message['subject'], message['from'], message['date']])

我还想把邮件正文(实际的消息内容)也包含进去……但不知道该怎么做。我以前没有用过 Python,有人能帮忙吗?我已经尝试过 Stack Overflow 上其他回答给出的方法,但都没有成功。

为了完成同样的任务,我还尝试了下面的代码,但第 60 行(return json_msg)出现缩进错误。我尝试了不同的缩进方式,但问题依旧。

import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

# Path of the mbox archive that the script opens for reading.
# NOTE(review): neither path starts with '/', so both are resolved relative
# to the current working directory — confirm that is intended.
MBOX = 'Users/mymachine/client1/Takeout/Mail/archive.mbox'
# Path of the output file; the script writes one JSON document per line to it.
OUT_FILE = 'Users/mymachine/client1/Takeout/Mail/archive.mbox.json'

def cleanContent(msg):
    """Return the text nodes of a quoted-printable encoded markup string.

    The input is first decoded with ``quopri.decodestring`` and then parsed
    with BeautifulSoup; all text nodes are concatenated into one string.

    :param msg: quoted-printable encoded message content.
    :returns: the concatenated text, or ``''`` if the content cannot be parsed.
    """
    decoded = quopri.decodestring(msg)
    try:
        soup = BeautifulSoup(decoded)
    except Exception:
        # Best effort: unparsable content becomes an empty string.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long conversion can still be interrupted.
        return ''
    return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise unsupported iterables as lists."""

    def default(self, o):
        """Materialize *o* into a list so the json module can encode it."""
        return list(o)

def gen_json_msgs(mb):
    """Yield one JSON-ready dict per message read from *mb*.

    :param mb: object whose ``next()`` method returns the next message, or
        ``None`` once there are no more messages.
    :returns: a generator of the values produced by ``jsonifyMessage``.
    """
    while True:
        msg = mb.next()
        if msg is None:
            break
        # The yield must sit outside the `if`: placed after `break` it was
        # unreachable and the generator never produced a single message.
        yield jsonifyMessage(msg)

# Build a dict for one message: every header keyed by name, plus a 'parts'
# list with one entry per non-multipart part.
# NOTE(review): the indentation below is kept exactly as posted, because it is
# what the question is about; the notes mark where it departs from what the
# code presumably intends.
def jsonifyMessage(msg):
    json_msg = {'parts': []}
    # Copy every header, decoding it as UTF-8 and dropping undecodable bytes.
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    for k in ['To', 'Cc', 'Bcc']:
            if not json_msg.get(k):
                continue
    # NOTE(review): this statement is dedented out of the loop above, so it
    # runs once after the loop with k == 'Bcc' (and raises KeyError when the
    # message has no Bcc header).  Presumably it was meant to be the loop body
    # after the `continue` guard: strip whitespace characters from the address
    # header and split it on ',' into a list.
    json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
    .replace(' ', '').decode('utf-8', 'ignore').split(',')

# NOTE(review): this `for` starts at column 0, which ends the function body
# above; `msg` and `json_msg` are locals of jsonifyMessage and do not exist at
# module level.  Presumably the loop belongs inside the function.
for part in msg.walk():
    json_part = {}
    # Multipart containers are skipped; only the remaining parts are recorded.
    if part.get_content_maintype() == 'multipart':
        continue


    json_part['contentType'] = part.get_content_type()
    content = part.get_payload(decode=False).decode('utf-8', 'ignore')
    json_part['content'] = cleanContent(content)

    json_msg['parts'].append(json_part)
    # Parse the Date header and replace it with {'$date': <milliseconds>},
    # computed through time.mktime.
    # NOTE(review): these three lines are indented as part of the loop body,
    # so they would run once per part; after the first pass json_msg['Date']
    # is already a dict.  Presumably they were meant to run once, after the
    # loop.
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date' : millis}

# NOTE(review): a `return` at column 0 is outside any function, so the file
# cannot compile; presumably it should be the last statement of jsonifyMessage.
return json_msg

# Stream the archive into OUT_FILE as one JSON document per line.
# The `with` statement closes both the archive and the output file even when
# a message fails to convert part-way through.
with open(MBOX, 'rb') as mbox_file, open(OUT_FILE, 'w') as f:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    for msg in gen_json_msgs(mbox):
        # Identity comparison is the correct test for None.
        if msg is not None:
            f.write(json.dumps(msg, cls=Encoder) + '\n')

试试这个。

import mailbox
import csv
# Export subject, sender, date and body of every message to a CSV file.
# Binary mode ("wb") is what the Python 2 csv module expects; the `with`
# block guarantees the file is flushed and closed.
with open("clean_mail.csv", "wb") as csv_file:
    writer = csv.writer(csv_file)
    for message in mailbox.mbox('archive.mbox'):
        if message.is_multipart():
            # Concatenate the payload of each direct sub-part.
            # NOTE: this assumes every sub-part payload is a string; a nested
            # multipart sub-part returns a list and makes join() fail.
            content = ''.join(part.get_payload() for part in message.get_payload())
        else:
            content = message.get_payload()
        writer.writerow([message['subject'], message['from'], message['date'], content])

或者这个:

import mailbox
import csv

def get_message(message):
    """Return the body of *message*.

    A non-multipart message yields its payload unchanged.  For a multipart
    message the payload of each direct sub-part is converted with ``str`` and
    followed by a newline, and the pieces are concatenated in order.
    """
    if message.is_multipart():
        return "".join(
            str(part.get_payload()) + '\n' for part in message.get_payload()
        )
    return message.get_payload()

if __name__ == "__main__":

    # Write one CSV row (subject, sender, date, body) per archived message.
    # Binary mode ("wb") is what the Python 2 csv module expects; the `with`
    # block guarantees the file is flushed and closed.
    with open("clean_mail.csv", "wb") as csv_file:
        writer = csv.writer(csv_file)
        for message in mailbox.mbox("archive.mbox"):
            contents = get_message(message)
            writer.writerow([message["subject"], message["from"], message["date"], contents])

相关文档请参见此处(原文为 “here” 链接,链接地址在提取时丢失)。

针对多部分(multipart)内容,对 Rahul 的代码段做了一点改进:

import sys
import mailbox
import csv
from email.header import decode_header

# Command line: <script> INPUT.mbox OUTPUT.csv
infile = sys.argv[1]
outfile = sys.argv[2]
# The csv module requires files to be opened with newline="" (Python 3):
# otherwise newlines embedded in quoted fields — common in mail bodies — are
# not handled correctly and extra "\r" characters appear on Windows.
writer = csv.writer(open(outfile, "w", newline=""))


def get_content(part):
    """Return the concatenated string payloads of *part* and its sub-parts.

    A string payload is returned as is; any other payload is treated as a
    sequence of sub-parts that are flattened recursively, in order.
    """
    payload = part.get_payload()
    if isinstance(payload, str):
        return '' + payload
    return ''.join(get_content(child) for child in payload)


# Header row, then one row per message in the archive.
writer.writerow(['date', 'from', 'to', 'subject', 'content'])
for message in mailbox.mbox(infile):
    # Headers can be missing; fall back to '' instead of crashing on None.
    sender = str(message['from'] or '')
    raw_subject = message['subject']
    if raw_subject is None:
        raw_subject = ''
    # decode_header returns (text, charset) chunks; as before, only the first
    # chunk is kept.  Encoded chunks come back as bytes and must be decoded,
    # otherwise the CSV would contain the literal "b'...'" representation.
    subject, charset = decode_header(raw_subject)[0]
    if isinstance(subject, bytes):
        try:
            subject = subject.decode(charset or 'ascii', 'replace')
        except LookupError:
            # Unknown charset label: keep whatever is representable.
            subject = subject.decode('ascii', 'replace')
    content = get_content(message)
    row = [
        message['date'],
        # Keep only the address from a "Name <address>" style header.
        sender.strip('>').split('<')[-1],
        message['to'],
        subject,
        content,
    ]
    writer.writerow(row)