|
| 1 | +"""Count the number of unique message threads in a Gmail inbox, by logging |
| 2 | +in with IMAP and checking the count of unique values of the X-GM-THRID header. |
| 3 | +
|
| 4 | +This example was helpful: |
| 5 | +http://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/ |
| 6 | +
|
| 7 | +""" |
| 8 | +import imaplib |
| 9 | +import re |
| 10 | +from collections import defaultdict |
| 11 | +from dateutil.parser import parser |
| 12 | +from operator import itemgetter |
| 13 | + |
# Messages fetched with the query '(ENVELOPE X-GM-THRID)' look like:
# 145 (X-GM-THRID 1490479738467641057 UID 63980 ENVELOPE ("Fri, 16 Jan 2015 18:17:21 +0000 (UTC)" "Mark, please add me to your LinkedIn network" (("Terry Kim" NIL "member" "linkedin.com")) (("Terry Kim" NIL "member" "linkedin.com")) ((NIL NIL "terry" "yelp.com")) (("Mark Wilson" NIL "mark.wilson" "aya.yale.edu")) NIL NIL NIL "<979753161.7821475.1421432241882.JavaMail.app@lva1-app1733.prod>"))
#
# The patterns below pick fields out of that text by position: the first
# quoted string after "ENVELOPE (" is the date, the second is the subject, and
# the first address group that follows is the From entry.  They are raw
# strings so that "\d" and "\(" reach the regex engine without relying on
# Python passing unknown string escapes through unchanged.

# Group 1: the Gmail thread id (digits only).
thread_id_re = re.compile(r'X-GM-THRID (\d+)')

# Group 1: the date string exactly as it appears in the envelope.
date_re = re.compile(r'ENVELOPE \("([^"]+)"')

# Group 1: the subject (may be empty).
subject_re = re.compile(r'ENVELOPE \("[^"]+" "([^"]*)"')

# Group 1: display name, or None when the envelope carries NIL instead of a
# quoted name; group 2: mailbox (local part); group 3: host (domain).
from_re = re.compile(
    r'ENVELOPE \("[^"]+" "[^"]*" '
    r'\(\((?:"([^"]*)"|NIL) NIL "([^"]*)" "([^"]*)"'
)
| 21 | + |
# Single dateutil parser instance, created once at import time and reused by
# date_str_to_timestamp() for every envelope date.
date_parser = parser()
| 23 | + |
def re_partial(regex):
    """Build an extractor bound to a compiled pattern.

    The returned callable takes a fetched message string, searches it with
    ``regex`` and returns the text of capture group 1 of the first match.
    It raises AttributeError when the pattern does not match, because
    ``search`` returns None in that case.
    """
    def extract_first_group(envelope):
        return regex.search(envelope).group(1)

    return extract_first_group
| 26 | + |
def date_str_to_timestamp(date_str):
    """Convert a date string to integer seconds since the Unix epoch.

    ``date_str`` is parsed with the module-level ``date_parser``.  When the
    parsed value carries a UTC offset (e.g. "+0000"), that offset is honoured;
    a value without any offset is interpreted as local time.

    The previous implementation used ``strftime('%s')``, which is a
    platform-specific extension and disregards the parsed time zone, so
    offset-bearing dates produced a wrong timestamp on any machine whose local
    zone is not UTC.
    """
    # Imported locally so this function does not depend on edits to the
    # file's import block.
    import calendar
    import time

    parsed = date_parser.parse(date_str)
    if parsed.utcoffset() is None:
        # Naive datetime: keep the historical "local time" interpretation.
        return int(time.mktime(parsed.timetuple()))
    # Aware datetime: normalise to UTC, then convert without consulting the
    # local time zone.
    return calendar.timegm(parsed.utctimetuple())
| 29 | + |
def get_from(envelope):
    """Extract the sender from a fetched message string.

    Returns a two-element list ``[display_name, address]`` where the address
    is built from the mailbox and host captured by ``from_re``.  Raises
    AttributeError when ``from_re`` does not match the message.
    """
    match = from_re.search(envelope)
    # Groups 1-3 are display name, mailbox and host; group 0 (the whole
    # match) is not needed.
    display_name, mailbox, host = match.group(1, 2, 3)
    address = '%s@%s' % (mailbox, host)
    return [display_name, address]
| 34 | + |
def _to_text(value):
    """Decode an imaplib response chunk to text when it arrives as bytes."""
    # On Python 2 ``bytes`` is ``str``, so the value is returned untouched.
    if isinstance(value, bytes) and not isinstance(value, str):
        return value.decode('utf-8', 'replace')
    return value


def _pick_thread_representative(messages):
    """Return the first message whose subject lacks a 'Re: ' prefix, else the first."""
    get_subject = re_partial(subject_re)
    for message in messages:
        if not get_subject(message).startswith('Re: '):
            return message
    return messages[0]


def gmail_thread_info(email, password):
    """Summarise the threads in a Gmail inbox, one entry per thread.

    Logs in to imap.gmail.com over SSL, fetches ENVELOPE and X-GM-THRID for
    every message in INBOX, groups the messages by thread id and keeps one
    representative per thread (preferring a message whose subject does not
    start with "Re: ").

    Args:
        email: Account name passed to the IMAP LOGIN command.
        password: Password passed to the IMAP LOGIN command.

    Returns:
        A list of dicts sorted by ``date_ts`` descending, each with the keys
        ``thread_id``, ``date`` (raw envelope date string), ``date_ts``
        (integer epoch seconds), ``subject`` and ``from`` (``[name, address]``).
        An empty inbox yields an empty list.

    Raises:
        imaplib.IMAP4.error: Propagated from imaplib on login/command failure.
        AttributeError: If a fetched message does not match one of the
            module's envelope patterns.
    """
    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    try:
        mail.login(email, password)
        mail.select('INBOX')

        _, (uid_list,) = mail.uid('search', None, 'ALL')
        # split() without an argument drops empty fields, so an empty search
        # result gives [] rather than [''] and no invalid FETCH is sent.
        uids = _to_text(uid_list or '').split()
        if not uids:
            return []

        _, inbox = mail.uid('fetch', ','.join(uids), '(ENVELOPE X-GM-THRID)')
    finally:
        # Always end the session, including on errors and the early return.
        mail.logout()

    # NOTE(review): entries are assumed to be plain strings; imaplib can also
    # return tuples when the server sends literals -- confirm whether Gmail
    # does so for ENVELOPE data.
    messages = [_to_text(entry) for entry in inbox]

    # Group messages into Gmail threads.
    get_thread_id = re_partial(thread_id_re)
    thread_id_to_messages = defaultdict(list)
    for message in messages:
        thread_id_to_messages[get_thread_id(message)].append(message)

    # Pull out the important fields from one representative per thread.
    get_date = re_partial(date_re)
    get_subject = re_partial(subject_re)
    out = []
    for thread_id, thread_messages in thread_id_to_messages.items():
        message = _pick_thread_representative(thread_messages)
        date = get_date(message)
        out.append({
            'thread_id': thread_id,
            'date': date,
            'date_ts': date_str_to_timestamp(date),
            'subject': get_subject(message),
            'from': get_from(message),
        })

    # Newest first.
    return sorted(out, key=itemgetter('date_ts'), reverse=True)
0 commit comments