Transforming a pre-formatted excel report into a tabular format without macros — Reporting system part 2
**Note:** I have set up a new consultancy, www.futureanalytics.io — please visit for any data analytics, app development or training needs you have.
This article follows on from my previous article https://alexanderamlani24.medium.com/extracting-and-writing-email-attachments-to-xlsx-using-the-ms-graph-api-reporting-system-part-1-82819256b630
In the past, when dealing with pre-formatted reports, I've either had to ask for direct access to the reporting database, and when that's not been possible I've had to rely on VBA/macros or manual intervention to cleanse the report into a format that I could then work with. So I thought this would be a good challenge for my new Python skills.
This is the structure of the data that I extracted in the last blog post. I’ve only put the structure of the data and no column names to avoid identification of the supplier.
The first thing I noticed is that there were some hidden files in the folder. This might have been because I had a file open and a temporary file was created; however, I thought I'd write a function to make sure we don't get anything we don't want in.
# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
import re
import time

from airflow.models import Variable

# Folder locations live in Airflow Variables so the pipeline can be
# re-pointed without a code change.
RAW_DATA_LOCATION = Variable.get('raw_data_location')
STAGING_DATA_LOCATION = Variable.get('staging_data_location')

# Set Pandas to display 500 rows
# (fixed: the original called pd.set_3, which is not a pandas API)
pd.set_option('display.max_rows', 500)
# Set Folder to use
folder = RAW_DATA_LOCATION


def cleanfiles(folder):
    """Return the names of the real (non-hidden) .xlsx files in *folder*.

    Excel creates '~$'-prefixed lock files while a workbook is open;
    those — and anything that is not an .xlsx — are filtered out so the
    downstream parsing only ever sees actual report files.
    """
    # List only files (not sub-directories) in the folder
    folderfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
    # Remove hidden lock files and return only excel files
    return [f for f in folderfiles
            if not f.startswith('~$') and f.endswith('xlsx')]
The next step is to create a function that will clean each individual file and then concatenate them into one dataframe
def cleandata(folder):
    """Read every report in *folder*, clean it, and return one DataFrame.

    Each xlsx is a pre-formatted report: the real table starts after 9
    header rows, has a blank first column, blank spacer rows and a
    repeated header row per category table.  The file name (YYYY-MM-DD)
    carries the report date and is added as a 'date' column.
    """
    df_list = []
    for file in cleanfiles(folder):
        # Read in excel file, skipping the pre-formatted report header
        df = pd.read_excel(folder + '/' + file, skiprows=9)
        # Drop empty leading column
        df = df.drop('Unnamed: 0', axis=1)
        # Drop all-empty spacer rows
        df = df.dropna(how='all', axis=0)
        # Fill NaN cells with 0 in the two sparsely populated columns
        df['18'].fillna(0, inplace=True)
        df['19'].fillna(0, inplace=True)
        # Drop any remaining rows with NaN values in any cell
        df = df.dropna(how='any', axis=0)
        # Drop duplicate rows (this eliminates the repeated header rows
        # coming from the separate per-category tables)
        df = df.drop_duplicates(keep=False)
        # Add the report date, parsed from the file name, in a new column
        date_time_str = file.split('.')[0]
        filedate = datetime.datetime.strptime(date_time_str, '%Y-%m-%d')
        df['date'] = filedate
        # Assign column types (the original repeated astype 15 times;
        # the conversions are independent so a data-driven loop is used)
        int_cols = ['2', '3', '4', '9', '11', '12', '13', '14',
                    '15', '16', '17', '18', '19']
        float_cols = ['7', '8', '10']
        for col in int_cols:
            df[col] = df[col].astype('int64')
        for col in float_cols:
            df[col] = df[col].astype('float')
        # Rename columns
        df.rename(columns={'12': '9-1', '13': '9-2', '14': '9-3'},
                  inplace=True)
        # Append to list of data frames
        df_list.append(df)
    # Concatenate newly cleaned dataframes into one large dataframe
    return pd.concat(df_list)
The above code cleans the data up so that each table of data for each category in the single report appends into one table and empty cells are filled along with data types are assigned.
We now however need to somehow associate each row in the data frame with the associated category which was above each table in the original report
There is one column which is a sub category in the table which we will use to create a lookup table and then join it on to the main table in the next step
def category_df(folder):
    """(Excerpt — full version below.)  Helper predicates for the lookup build."""
    category_df_list = []

    def hasNumbers(inputString):
        # Category headings are the only cells whose text contains digits
        return bool(re.search(r'\d', inputString))

    def isDept(x):
        # '1' refers to column 1, which is the sub-category header cell
        if x == '1':
            return True
The above code assigns a column that determines whether the column string contains a numeric value or not. The categories above the tables all contained numbers, and I used this to help me create the category column, as the category column itself fell directly on top of the subcategory column.
def category_df(folder):
    """Build a Category -> sub-category lookup table from the reports.

    In the original report each category heading sits directly above
    its table; the headings (and only the headings) contain digits,
    which is what hasNumbers keys on.  Forward-filling then associates
    every sub-category row with the heading above it.
    """
    category_df_list = []

    def hasNumbers(inputString):
        # Category headings are the only cells whose text contains digits
        return bool(re.search(r'\d', inputString))

    def isDept(x):
        # '1' refers to column 1, which is the sub-category header cell
        if x == '1':
            return True

    for file in cleanfiles(folder):
        # Read in excel file; skiprows=6 keeps the category headings visible
        categorydf = pd.read_excel(folder + '/' + file, skiprows=6)
        categorydf = categorydf['Unnamed: 1'].reset_index()
        categorydf = categorydf.fillna(method='ffill')
        categorydf.drop('index', axis=1, inplace=True)
        # Add column to determine if the cell text contains an integer
        categorydf['bool'] = categorydf['Unnamed: 1'].map(hasNumbers)
        # If the cell contains an integer it is a heading: use it as the
        # value for the Category column
        categorydf['Category'] = categorydf.apply(
            lambda x: x['Unnamed: 1'] if x['bool'] == True else None,
            axis=1)
        # Forward fill categories to associate each sub-category row
        # with the heading above it
        categorydf.fillna(method='ffill', inplace=True)
        # Create boolean column to decide if the row is the column header
        categorydf['isDept?'] = categorydf['Unnamed: 1'].map(isDept)
        categorydf.drop_duplicates(inplace=True)
        # Filter out headings and the repeated sub-category header rows
        mask = (categorydf['bool'] == False) & (categorydf['isDept?'] != True)
        categorydf = categorydf[mask]
        categorydf = categorydf.copy()
        categorydf.rename(columns={'Unnamed: 1': '1'}, inplace=True)
        # Strip the digits, then the leading character, from each heading
        categorydf['Category'] = categorydf['Category'].map(
            lambda x: re.sub(r'\d+', '', x))
        categorydf['Category'] = categorydf['Category'].map(lambda x: x[1:])
        categorydf.drop(['bool', 'isDept?'], axis=1, inplace=True)
        category_df_list.append(categorydf)

    category_df = pd.concat(category_df_list)
    category_df.drop_duplicates(keep='first', inplace=True)
    # Final result: department names and categories side by side as a
    # look-up table
    category_df = (category_df[['Category', '1']]
                   .sort_values(by='Category', ascending=True)
                   .reset_index()
                   .drop('index', axis=1))
    return category_df


# Build and persist the staging tables
df = cleandata(folder)
df.to_csv(STAGING_DATA_LOCATION + 'sales_stage.csv')
categorydf = category_df(folder)
categorydf.to_csv(STAGING_DATA_LOCATION + 'category_stage.csv')
The end result is two files: one with all reports cleaned into one large data frame, and a look-up table to match sub-category to major category.
We had to derive the sub-category and category this way, as we were not able to manually create a lookup table due to the fact that we do not control any new sub-categories/categories being added
The below code then takes these staging tables and creates a star schema with some additional meta data for the image
# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
from airflow.models import Variable


def dim_status(df):
    """Build the Status dimension: one surrogate key per distinct status."""
    current_status = (df['6'].drop_duplicates()
                      .reset_index().drop('index', axis=1))
    # Surrogate keys are simply 1..n over the distinct values
    current_status['Status_SK'] = range(1, len(current_status.index) + 1)
    return current_status[['Status_SK', '6']]


def to_integer(dt_time):
    """Convert a date to a YYYYMMDD integer surrogate key."""
    return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day


def create_date_table(df):
    """Build a Date dimension covering the staging data's date range."""
    start = df['date'].min()
    end = df['date'].max()
    df = pd.DataFrame({"Date": pd.date_range(start, end)})
    df['Date_SK'] = df['Date'].map(to_integer)
    # BUG FIX: day_name is a method — the original assigned the bound
    # method object (missing call parentheses) instead of the day names
    df["Day"] = df.Date.dt.day_name()
    # NOTE: Series.dt.weekofyear was removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number
    df["Week"] = df.Date.dt.isocalendar().week
    df["Quarter"] = df.Date.dt.quarter
    df["Year"] = df.Date.dt.year
    df["Year_half"] = (df.Quarter + 1) // 2
    cols = ['Date_SK', 'Date', 'Day', 'Week', 'Quarter', 'Year', 'Year_half']
    return df[cols]


def dim_item(df):
    """Build the Item dimension joined to its category and URL metadata.

    Relies on the module-level `category_df` lookup table loaded below.
    """
    mask = ['5', '1', '4', '2', '3']
    df2 = df[mask].drop_duplicates().sort_values(by='5')
    df2['Item_SK'] = range(1, len(df2.index) + 1)
    Item = df2[['Item_SK', '5', '1', '4', '2', '3']]
    # Attach the major category via the staging look-up table
    Item = Item.merge(category_df, left_on='1', right_on='1', how='left')
    # Column '3' holds the product code used in both URL patterns
    Item_codes = Item['3']
    Item['ItemURL'] = [
        'https://www.company.com/uk/p/{}'.format(code)
        for code in Item_codes]
    Item['ImageURL'] = [
        'https://media2.companyassets.com/i/company/{}/'.format(code)
        for code in Item_codes]
    return Item


def sales(df):
    """Build the Sales fact table keyed on the dimension surrogate keys.

    Relies on the module-level `Status`, `Item` and `Date` dimension
    tables built below.
    """
    # Swap natural keys for surrogate keys, one dimension at a time
    # (removed a dead no-op `Item[cols_to_use]` expression from the original)
    df = df.merge(Status, left_on='6', right_on='6',
                  how='left').drop('6', axis=1)
    columns = ['2', '3', '5', '4', '1']
    sales = df.merge(Item, left_on=columns, right_on=columns,
                     how='left').drop(columns, axis=1)
    sales.rename(columns={'date': 'Date'}, inplace=True)
    sales['Date'] = pd.to_datetime(sales['Date'])
    Date_table = Date[['Date_SK', 'Date']]
    sales = sales.merge(Date_table, left_on='Date', right_on='Date')
    # BUG FIX: the original list contained an empty-string column name
    # where '15' belongs (cols 12-14 were renamed 9-1..9-3 upstream, so
    # '15' is the only measure column otherwise missing) — TODO confirm
    # against the redacted report schema
    cols = ['Item_SK', 'Status_SK', 'Date_SK', '7', '8', '9',
            '10', '11', '9-1', '9-2', '9-3', '15',
            '16', '17', '18', '19']
    return sales[cols]


STAGING_DATA_LOCATION = Variable.get('staging_data_location')
STAR_SCHEMA_LOCATION = Variable.get('star_schema_location')

staging_df = pd.read_csv(STAGING_DATA_LOCATION + 'sales_stage.csv')
category_df = pd.read_csv(STAGING_DATA_LOCATION + 'category_stage.csv')

# Build each star-schema table (order matters: sales() reads the
# module-level dimension tables)
Status = dim_status(staging_df)
Item = dim_item(staging_df)
Date = create_date_table(staging_df)
Sales = sales(staging_df)

# Persist every table to the star-schema location
table_dict = {'Status': Status, 'Item': Item, 'Date': Date, 'Sales': Sales}
for name, table in table_dict.items():
    table.to_csv(STAR_SCHEMA_LOCATION + name + '.csv')
The following article will go over the generation of the reports