Extracting and writing email attachments to xlsx using the MS graph API— Reporting System part 1

  • Apache Airflow — for orchestration of the system
  • Python — for ETL (Airflow is also coded with Python)
  • GCP storage buckets and BigQuery (I did also have Airflow running on a VM for practice; however, it's not worth the cost for this project)
  • WeasyPrint — Really great PDF Generator for python
  • Flask (Jinja2), HTML, CSS — to create the reports (I really enjoyed creating pixel-perfect reports this way!)
  • Unit Testing
  • CI / CD integration
  • Live dashboard using something like cube.js/ custom react dashboard (https://cube.dev/)
import requests
import json
import base64
import io
import pandas as pd
from xlrd import open_workbook
import os
import logging as log
from airflow.models import Variable
# Airflow Variables hold the Azure AD app credentials and the target mailbox,
# so no secrets live in the source.
APP_ID = Variable.get('app_id')
CLIENT_SECRET = Variable.get('client_secret')
EMAIL = Variable.get('email')
RAW_DATA_LOCATION = Variable.get('raw_data_location')

# Get a token via the OAuth2 client-credentials flow --
# https://docs.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-auth-code-flow
# FIX: the original paste fused the `url = ...` assignment into the comment
# line above it, leaving `url` undefined at the requests.post call.
# NOTE(review): {tenant_id} must be replaced with the real Azure AD tenant id.
url = 'https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'
data = {
    'grant_type': 'client_credentials',
    'client_id': APP_ID,
    'scope': 'https://graph.microsoft.com/.default',
    'client_secret': CLIENT_SECRET,
}
r = requests.post(url, data=data)
access_token = r.json().get('access_token')
# Call the Graph endpoint and receive the response for messages in the mailbox.
# NOTE(review): {yourmailfolderid} must be replaced with a real mail folder id.
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_url = graph_endpoint.format(
    '/users/' +
    EMAIL
    + '/mailFolders/{yourmailfolderid}/messages')
headers = {
    'User-Agent': 'PythonOutlook',
    'Authorization': 'Bearer {0}'.format(access_token),
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}
response = requests.get(url=request_url, headers=headers)
# FIX: removed the dead `response.json().keys()` expression -- a notebook
# inspection leftover whose result was discarded.
def makeapirequest(endpoint, q_param=None):
    """GET *endpoint* from the MS Graph API, following @odata.nextLink paging.

    Paging approach based on
    https://journeyofthegeek.com/2019/01/08/using-python-to-pull-data-from-ms-graph-api-part-2/

    Args:
        endpoint: Full Graph API URL to request.
        q_param: Optional dict of query parameters.

    Returns:
        The decoded JSON payload with all paged 'value' entries merged in.

    Raises:
        Exception: on any non-200 HTTP response.
    """
    headers = {
        'User-Agent': 'PythonOutlook',
        'Authorization': 'Bearer {0}'.format(access_token),
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
    log.info('Making request to %s...', endpoint)
    if q_param is not None:
        response = requests.get(endpoint, headers=headers, params=q_param)
    else:
        response = requests.get(endpoint, headers=headers)
    if response.status_code == 200:
        json_data = json.loads(response.text)
        if '@odata.nextLink' in json_data.keys():
            log.info('Paged result returned...')
            # FIX: the original recursed with `access_token` in the q_param
            # position, sending the bearer token as query parameters.
            record = makeapirequest(json_data['@odata.nextLink'], q_param)
            json_data['value'].extend(record['value'])
        # FIX: return for every successful response, not only paged ones --
        # otherwise single-page results would return None.
        return json_data
    else:
        raise Exception('Request failed with ', response.status_code, ' - ',
                        response.text)
# Fetch the (paged) message list and collect each email's id.
response_text = makeapirequest(request_url)
number_of_emails = len(response_text['value'])
email_ids = []
for message in response_text['value']:
    email_ids.append(message['id'])

# Build one attachments-listing URL per email.
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_urls = []
for email_id in email_ids:
    request_urls.append(graph_endpoint.format(
        '/users/'
        + EMAIL +
        '/messages/' + email_id + '/attachments/'))
# Store the attachments-listing response for each email.
attach_jsons = []
for i in range(number_of_emails):
    attach_jsons.append(requests.get(url=request_urls[i], headers=headers))

# Get the id of the first attachment from each associated email.
# NOTE(review): assumes every email has at least one attachment -- an email
# without attachments raises IndexError here; confirm upstream filtering.
attach_ids = []
for i in range(len(attach_jsons)):
    attach_ids.append(attach_jsons[i].json()['value'][0]['id'])

# Get the attachment content (base64 string) keyed by the attachment date.
content_dict = {}
for i in range(len(attach_jsons)):
    # Create a URL to get the response for the attachment content.
    attach_url = request_urls[i] + attach_ids[i]
    response = requests.get(url=attach_url, headers=headers)
    # Decode the JSON body once instead of twice per attachment.
    payload = response.json()
    date = payload['lastModifiedDateTime'].split('T')[0]
    content_dict[date] = payload['contentBytes']
directory = RAW_DATA_LOCATION
# Remove any existing xlsx files to avoid duplication.
filelist = [f for f in os.listdir(directory) if f.endswith(".xlsx")]
for f in filelist:
    os.remove(os.path.join(directory, f))

# Write each attachment's decoded bytes to <date>.xlsx in the local directory.
for key in content_dict:
    coded_string = content_dict[key]
    decoded = base64.b64decode(coded_string)  # bytes
    # FIX: os.path.join instead of string concatenation (the original broke
    # when RAW_DATA_LOCATION lacked a trailing separator), and a context
    # manager guarantees the handle is closed even on a write error.
    with open(os.path.join(directory, str(key) + '.xlsx'), 'wb') as xlsfile:
        xlsfile.write(decoded)
import requests
import json
import base64
import io
import pandas as pd
from xlrd import open_workbook
import os
import logging as log
from airflow.models import Variable
# Credentials and the output path are sourced from Airflow Variables.
APP_ID = Variable.get('app_id')
CLIENT_SECRET = Variable.get('client_secret')
EMAIL = Variable.get('email')
RAW_DATA_LOCATION = Variable.get('raw_data_location')

# Request an OAuth2 access token (client-credentials flow) --
# https://docs.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-auth-code-flow
url = 'https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'
data = {
    'grant_type': 'client_credentials',
    'scope': 'https://graph.microsoft.com/.default',
    'client_id': APP_ID,
    'client_secret': CLIENT_SECRET,
}
r = requests.post(url, data=data)
access_token = r.json().get('access_token')
# Call the Graph endpoint and receive the response for messages in the mailbox.
# NOTE(review): {yourmailfolderid} must be replaced with a real mail folder id.
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_url = graph_endpoint.format(
    '/users/' +
    EMAIL
    + '/mailFolders/{yourmailfolderid}/messages')
headers = {
    'User-Agent': 'PythonOutlook',
    'Authorization': 'Bearer {0}'.format(access_token),
    'Accept': 'application/json',
    'Content-Type': 'application/json'
}
response = requests.get(url=request_url, headers=headers)
# FIX: removed the dead `response.json().keys()` expression -- a notebook
# inspection leftover whose result was discarded.
# This function handles the paginated response received from the Graph --
# https://journeyofthegeek.com/2019/01/08/using-python-to-pull-data-from-ms-graph-api-part-2/
def makeapirequest(endpoint, q_param=None):
    """GET *endpoint* from the MS Graph API, following @odata.nextLink paging.

    Args:
        endpoint: Full Graph API URL to request.
        q_param: Optional dict of query parameters.

    Returns:
        The decoded JSON payload with all paged 'value' entries merged in.

    Raises:
        Exception: on any non-200 HTTP response.
    """
    headers = {
        'User-Agent': 'PythonOutlook',
        'Authorization': 'Bearer {0}'.format(access_token),
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
    log.info('Making request to %s...', endpoint)
    if q_param is not None:
        response = requests.get(endpoint, headers=headers, params=q_param)
    else:
        response = requests.get(endpoint, headers=headers)
    if response.status_code == 200:
        json_data = json.loads(response.text)
        if '@odata.nextLink' in json_data.keys():
            log.info('Paged result returned...')
            # FIX: the original recursed with `access_token` in the q_param
            # position, sending the bearer token as query parameters.
            record = makeapirequest(json_data['@odata.nextLink'], q_param)
            json_data['value'].extend(record['value'])
        # FIX: return for every successful response, not only paged ones --
        # otherwise single-page results would return None.
        return json_data
    else:
        raise Exception('Request failed with ', response.status_code, ' - ',
                        response.text)
# Fetch the (paged) message list and collect each email's id.
response_text = makeapirequest(request_url)
number_of_emails = len(response_text['value'])
email_ids = []
for message in response_text['value']:
    email_ids.append(message['id'])

# Build one attachments-listing URL per email.
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_urls = []
for email_id in email_ids:
    request_urls.append(graph_endpoint.format(
        '/users/'
        + EMAIL +
        '/messages/' + email_id + '/attachments/'))
# Store the attachments-listing response for each email.
attach_jsons = []
for i in range(number_of_emails):
    attach_jsons.append(requests.get(url=request_urls[i], headers=headers))

# Get the id of the first attachment from each associated email.
# NOTE(review): assumes every email has at least one attachment -- an email
# without attachments raises IndexError here; confirm upstream filtering.
attach_ids = []
for i in range(len(attach_jsons)):
    attach_ids.append(attach_jsons[i].json()['value'][0]['id'])

# Get the attachment content (base64 string) keyed by the attachment date.
content_dict = {}
for i in range(len(attach_jsons)):
    # Create a URL to get the response for the attachment content.
    attach_url = request_urls[i] + attach_ids[i]
    response = requests.get(url=attach_url, headers=headers)
    # Decode the JSON body once instead of twice per attachment.
    payload = response.json()
    date = payload['lastModifiedDateTime'].split('T')[0]
    content_dict[date] = payload['contentBytes']
directory = RAW_DATA_LOCATION
# Remove any existing xlsx files to avoid duplication.
filelist = [f for f in os.listdir(directory) if f.endswith(".xlsx")]
for f in filelist:
    os.remove(os.path.join(directory, f))

# Write each attachment's decoded bytes to <date>.xlsx in the local directory.
for key in content_dict:
    coded_string = content_dict[key]
    decoded = base64.b64decode(coded_string)  # bytes
    # FIX: os.path.join instead of string concatenation (the original broke
    # when RAW_DATA_LOCATION lacked a trailing separator), and a context
    # manager guarantees the handle is closed even on a write error.
    with open(os.path.join(directory, str(key) + '.xlsx'), 'wb') as xlsfile:
        xlsfile.write(decoded)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store