Extracting and writing email attachments to xlsx using the MS graph API— Reporting System part 1

  • apache airlflow — for orchestration of the system
  • python — for ETL , (airlfow is also coded with python)
  • GCP storage buckets and Big Query (I did also have airflow running on a VM for practice however its not worth the cost for this project)
  • WeasyPrint — Really great PDF Generator for python
  • Flask (Jinja2) , HTML, CSS — to create the reports (I really enjoyed created pixel perfect reports this way! )
  • Unit Testing
  • CI / CD integration
  • Live dashboard using something like cube.js/ custom react dashboard (https://cube.dev/)
import requests
import json
import base64
import io
import pandas as pd
from xlrd import open_workbook
import os
import logging as log
from airflow.models import Variable
# airflow Variables
APP_ID = Variable.get('app_id')
CLIENT_SECRET = Variable.get('client_secret')
EMAIL = Variable.get('email')
RAW_DATA_LOCATION = Variable.get('raw_data_location')
# Get a token -- https://docs.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-auth-code-flowurl='https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'data = {
'grant_type': 'client_credentials',
'client_id': APP_ID,
'scope': 'https://graph.microsoft.com/.default',
'client_secret': CLIENT_SECRET
}
r = requests.post(url, data=data)
access_token = r.json().get('access_token')
#Call the graph end point and receive response for messages in mailbox
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_url = graph_endpoint.format(
'/users/' +
EMAIL
+ '/mailFolders/{yourmailfolderid}/messages')
headers = {
'User-Agent': 'PythonOutlook',
'Authorization': 'Bearer {0}'.format(access_token),
'Accept': 'application/json',
'Content-Type': 'application/json'
}
response = requests.get(url=request_url, headers=headers)
response.json().keys()
def makeapirequest(endpoint, q_param=None):headers = {
'User-Agent': 'PythonOutlook',
'Authorization': 'Bearer {0}'.format(access_token),
'Accept': 'application/json',
'Content-Type': 'application/json'
}
log.info('Making request to %s...', endpoint)if q_param != None:
response = requests.get(endpoint, headers=headers, params=q_param)
# print(response.url)
else:
response = requests.get(endpoint, headers=headers)
if response.status_code == 200:
json_data = json.loads(response.text)
if '@odata.nextLink' in json_data.keys():
log.info('Paged result returned...')
record = makeapirequest(json_data['@odata.nextLink'], access_token)
entries = len(record['value'])
count = 0
while count < entries:
json_data['value'].append(record['value'][count])
count += 1
return(json_data)
else:
raise Exception('Request failed with ', response.status_code, ' - ',
response.text)
response_text = makeapirequest(request_url)number_of_emails = len(response_text['value'])email_ids = []
for i in range(0, number_of_emails, 1):
email_ids.append(response_text['value'][i]['id'])
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'request_urls = []for email_id in email_ids:
request_urls.append(graph_endpoint.format(
'/users/'
+ EMAIL +
'/messages/'+email_id+'/attachments/'))
#Store the request json in a list 
attach_jsons = []
for i in range(0, number_of_emails, 1):
attach_jsons.append(requests.get(url=request_urls[i], headers=headers))
#Get the id of each attachment from the associated email
attach_ids = []
for i in range(0, len(attach_jsons), 1):
attach_ids.append(attach_jsons[i].json()['value'][0]['id'])
#Get the data from the attachment in byte format and store in dictionary to associate the date of the attachmentcontent_dict = {}
for i in range(0, len(attach_jsons), 1):
#Create a URL to ge the response for the attachment content
attach_url = request_urls[i]+attach_ids[i]
response = requests.get(url=attach_url, headers=headers)
date = response.json()['lastModifiedDateTime'].split('T')[0]
content_dict[date] = response.json()['contentBytes']
directory = RAW_DATA_LOCATION#Remove any existing xlsx files to avoid duplication
filelist = [f for f in os.listdir(directory) if f.endswith(".xlsx")]
for f in filelist:
os.remove(os.path.join(directory, f))
#Write data to xlsx in local directory
for key in content_dict:
coded_string = content_dict[key]
decoded = base64.b64decode(coded_string)
type(decoded) # => <class 'bytes'>
xlsfile = open(directory+str(key)+'.xlsx', 'wb')
xlsfile.write(decoded)
xlsfile.close()
import requests
import json
import base64
import io
import pandas as pd
from xlrd import open_workbook
import os
import logging as log
from airflow.models import Variable
# airflow Variables
APP_ID = Variable.get('app_id')
CLIENT_SECRET = Variable.get('client_secret')
EMAIL = Variable.get('email')
RAW_DATA_LOCATION = Variable.get('raw_data_location')
# Get a token -- https://docs.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-auth-code-flow
url = 'https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'
data = {
'grant_type': 'client_credentials',
'client_id': APP_ID,
'scope': 'https://graph.microsoft.com/.default',
'client_secret': CLIENT_SECRET
}
r = requests.post(url, data=data)
access_token = r.json().get('access_token')
#Call the graph end point and recieve response for messages in mailbox
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'
request_url = graph_endpoint.format(
'/users/' +
EMAIL
+ '/mailFolders/{yourmailfolderid}/messages')
headers = {
'User-Agent': 'PythonOutlook',
'Authorization': 'Bearer {0}'.format(access_token),
'Accept': 'application/json',
'Content-Type': 'application/json'
}
response = requests.get(url=request_url, headers=headers)
response.json().keys()
# This function handles the paginated response recieved from the graph --https://journeyofthegeek.com/2019/01/08/using-python-to-pull-data-from-ms-graph-api-part-2/
def makeapirequest(endpoint, q_param=None):
headers = {
'User-Agent': 'PythonOutlook',
'Authorization': 'Bearer {0}'.format(access_token),
'Accept': 'application/json',
'Content-Type': 'application/json'
}
log.info('Making request to %s...', endpoint)if q_param != None:
response = requests.get(endpoint, headers=headers, params=q_param)
# print(response.url)
else:
response = requests.get(endpoint, headers=headers)
if response.status_code == 200:
json_data = json.loads(response.text)
if '@odata.nextLink' in json_data.keys():
log.info('Paged result returned...')
record = makeapirequest(json_data['@odata.nextLink'], access_token)
entries = len(record['value'])
count = 0
while count < entries:
json_data['value'].append(record['value'][count])
count += 1
return(json_data)
else:
raise Exception('Request failed with ', response.status_code, ' - ',
response.text)
response_text = makeapirequest(request_url)number_of_emails = len(response_text['value'])email_ids = []
for i in range(0, number_of_emails, 1):
email_ids.append(response_text['value'][i]['id'])
graph_endpoint = 'https://graph.microsoft.com/v1.0{}'request_urls = []for email_id in email_ids:
request_urls.append(graph_endpoint.format(
'/users/'
+ EMAIL +
'/messages/'+email_id+'/attachments/'))
attach_jsons = []
for i in range(0, number_of_emails, 1):
attach_jsons.append(requests.get(url=request_urls[i], headers=headers))
attach_ids = []
for i in range(0, len(attach_jsons), 1):
attach_ids.append(attach_jsons[i].json()['value'][0]['id'])
content_dict = {}
for i in range(0, len(attach_jsons), 1):
#Create a URL to ge the response for the attachment content
attach_url = request_urls[i]+attach_ids[i]
response = requests.get(url=attach_url, headers=headers)
date = response.json()['lastModifiedDateTime'].split('T')[0]
content_dict[date] = response.json()['contentBytes']
directory = RAW_DATA_LOCATIONfilelist = [f for f in os.listdir(directory) if f.endswith(".xlsx")]for f in filelist:
os.remove(os.path.join(directory, f))
for key in content_dict:
coded_string = content_dict[key]
decoded = base64.b64decode(coded_string)
type(decoded) # => <class 'bytes'>
xlsfile = open(directory+str(key)+'.xlsx', 'wb')
xlsfile.write(decoded)
xlsfile.close()

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

How I Became a Self-Taught Developer in 6 Month

Documents in a Paperless Office, Find a Needle in the Haystack

March LeetCoding Challenge 2021 — Day 19: Keys and Rooms

Ansible The Best Tool For Blockchain Developers To Deploy

Spring Security important concepts explained

Getting Started with Nexus for z/OS

How to create an Android library

RESTful Web Services Tutorial with Example in Just 10 Minutes

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Alexander

Alexander

More from Medium

Analyzing housing prices

New Feature in Airflow 2.3 — Grid View & Dynamic Task Mapping

Innovation of Data Integration Technology in the Intelligent Era