Transforming a pre-formatted excel report into a tabular format without macros — Reporting system part 2
**Note:** I have set up a new consultancy, www.futureanalytics.io — please visit for any data analytics, app development or training needs you have.
This article follows on from my previous article https://alexanderamlani24.medium.com/extracting-and-writing-email-attachments-to-xlsx-using-the-ms-graph-api-reporting-system-part-1-82819256b630
In the past, when dealing with pre-formatted reports, I've either had to ask for direct access to the reporting database, and when that's not been possible I've had to rely on VBA/macros or manual intervention to cleanse the report into a format that I could then work with. So I thought this would be a good challenge for my new Python skills.
This is the structure of the data that I extracted in the last blog post. I’ve only put the structure of the data and no column names to avoid identification of the supplier.
The first thing I noticed is that there were some hidden files in the folder. This might have been because I had a file open and a temporary file was created; however, I thought I'd write a function to make sure we don't get anything we don't want in.
# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
import re
import time

from airflow.models import Variable

# Folder locations live in Airflow Variables so the pipeline can be
# re-pointed without a code change.
RAW_DATA_LOCATION = Variable.get('raw_data_location')
STAGING_DATA_LOCATION = Variable.get('staging_data_location')

# Set Pandas to display 500 rows
# (fixed: the original called pd.set_3, which is not a pandas API)
pd.set_option('display.max_rows', 500)
# Set Folder to use
folder = RAW_DATA_LOCATION


def cleanfiles(folder):
    """Return the names of the real (non-hidden) .xlsx files in *folder*.

    Excel creates '~$'-prefixed lock files while a workbook is open;
    those — and anything that is not an .xlsx — are filtered out so the
    downstream parsing only ever sees actual report files.
    """
    # List only files (not sub-directories) in the folder
    folderfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
    # Remove hidden lock files and return only excel files
    return [f for f in folderfiles
            if not f.startswith('~$') and f.endswith('xlsx')]
The next step is to create a function that will clean each individual file and then concatenate them into one dataframe
def cleandata(folder):
    """Read every report in *folder*, clean it, and return one DataFrame.

    Each xlsx is a pre-formatted report: the real table starts after 9
    header rows, has a blank first column, blank spacer rows and a
    repeated header row per category table.  The file name (YYYY-MM-DD)
    carries the report date and is added as a 'date' column.
    """
    df_list = []
    for file in cleanfiles(folder):
        # Read in excel file, skipping the pre-formatted report header
        df = pd.read_excel(folder + '/' + file, skiprows=9)
        # Drop empty leading column
        df = df.drop('Unnamed: 0', axis=1)
        # Drop all-empty spacer rows
        df = df.dropna(how='all', axis=0)
        # Fill NaN cells with 0 in the two sparsely populated columns
        df['18'].fillna(0, inplace=True)
        df['19'].fillna(0, inplace=True)
        # Drop any remaining rows with NaN values in any cell
        df = df.dropna(how='any', axis=0)
        # Drop duplicate rows (this eliminates the repeated header rows
        # coming from the separate per-category tables)
        df = df.drop_duplicates(keep=False)
        # Add the report date, parsed from the file name, in a new column
        date_time_str = file.split('.')[0]
        filedate = datetime.datetime.strptime(date_time_str, '%Y-%m-%d')
        df['date'] = filedate
        # Assign column types (the original repeated astype 15 times;
        # the conversions are independent so a data-driven loop is used)
        int_cols = ['2', '3', '4', '9', '11', '12', '13', '14',
                    '15', '16', '17', '18', '19']
        float_cols = ['7', '8', '10']
        for col in int_cols:
            df[col] = df[col].astype('int64')
        for col in float_cols:
            df[col] = df[col].astype('float')
        # Rename columns
        df.rename(columns={'12': '9-1', '13': '9-2', '14': '9-3'},
                  inplace=True)
        # Append to list of data frames
        df_list.append(df)
    # Concatenate newly cleaned dataframes into one large dataframe
    return pd.concat(df_list)
The above code cleans the data up so that each table of data for each category in the single report appends into one table and empty cells are filled along with data types are assigned.
We now however need to somehow associate each row in the data frame with the associated category which was above each table in the original report
There is one column which is a sub category in the table which we will use to create a lookup table and then join it on to the main table in the next step
def category_df(folder):
    """(Excerpt — full version below.)  Helper predicates for the lookup build."""
    category_df_list = []

    def hasNumbers(inputString):
        # Category headings are the only cells whose text contains digits
        return bool(re.search(r'\d', inputString))

    def isDept(x):
        # '1' refers to column 1, which is the sub-category header cell
        if x == '1':
            return True
The above code assigns a column that determines whether the column string contains a numeric value or not. The categories above the tables all contained numbers, and I used this to help me create the category column, as the category column itself fell directly on top of the subcategory column.
def category_df(folder):
    """Build a Category -> sub-category lookup table from the reports.

    In the original report each category heading sits directly above
    its table; the headings (and only the headings) contain digits,
    which is what hasNumbers keys on.  Forward-filling then associates
    every sub-category row with the heading above it.
    """
    category_df_list = []

    def hasNumbers(inputString):
        # Category headings are the only cells whose text contains digits
        return bool(re.search(r'\d', inputString))

    def isDept(x):
        # '1' refers to column 1, which is the sub-category header cell
        if x == '1':
            return True

    for file in cleanfiles(folder):
        # Read in excel file; skiprows=6 keeps the category headings visible
        categorydf = pd.read_excel(folder + '/' + file, skiprows=6)
        categorydf = categorydf['Unnamed: 1'].reset_index()
        categorydf = categorydf.fillna(method='ffill')
        categorydf.drop('index', axis=1, inplace=True)
        # Add column to determine if the cell text contains an integer
        categorydf['bool'] = categorydf['Unnamed: 1'].map(hasNumbers)
        # If the cell contains an integer it is a heading: use it as the
        # value for the Category column
        categorydf['Category'] = categorydf.apply(
            lambda x: x['Unnamed: 1'] if x['bool'] == True else None,
            axis=1)
        # Forward fill categories to associate each sub-category row
        # with the heading above it
        categorydf.fillna(method='ffill', inplace=True)
        # Create boolean column to decide if the row is the column header
        categorydf['isDept?'] = categorydf['Unnamed: 1'].map(isDept)
        categorydf.drop_duplicates(inplace=True)
        # Filter out headings and the repeated sub-category header rows
        mask = (categorydf['bool'] == False) & (categorydf['isDept?'] != True)
        categorydf = categorydf[mask]
        categorydf = categorydf.copy()
        categorydf.rename(columns={'Unnamed: 1': '1'}, inplace=True)
        # Strip the digits, then the leading character, from each heading
        categorydf['Category'] = categorydf['Category'].map(
            lambda x: re.sub(r'\d+', '', x))
        categorydf['Category'] = categorydf['Category'].map(lambda x: x[1:])
        categorydf.drop(['bool', 'isDept?'], axis=1, inplace=True)
        category_df_list.append(categorydf)

    category_df = pd.concat(category_df_list)
    category_df.drop_duplicates(keep='first', inplace=True)
    # Final result: department names and categories side by side as a
    # look-up table
    category_df = (category_df[['Category', '1']]
                   .sort_values(by='Category', ascending=True)
                   .reset_index()
                   .drop('index', axis=1))
    return category_df


# Build and persist the staging tables
df = cleandata(folder)
df.to_csv(STAGING_DATA_LOCATION + 'sales_stage.csv')
categorydf = category_df(folder)
categorydf.to_csv(STAGING_DATA_LOCATION + 'category_stage.csv')
The end result is two files: one with all reports cleaned into one large data frame, and a look-up table to match sub-category to major category.
We had to derive the sub-category and category this way, as we were not able to manually create a lookup table due to the fact that we do not control any new sub-categories/categories being added
The below code then takes these staging tables and creates a star schema with some additional meta data for the image
# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
from airflow.models import Variable


def dim_status(df):
    """Build the Status dimension: one surrogate key per distinct status."""
    current_status = (df['6'].drop_duplicates()
                      .reset_index().drop('index', axis=1))
    # Surrogate keys are simply 1..n over the distinct values
    current_status['Status_SK'] = range(1, len(current_status.index) + 1)
    return current_status[['Status_SK', '6']]


def to_integer(dt_time):
    """Convert a date to a YYYYMMDD integer surrogate key."""
    return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day


def create_date_table(df):
    """Build a Date dimension covering the staging data's date range."""
    start = df['date'].min()
    end = df['date'].max()
    df = pd.DataFrame({"Date": pd.date_range(start, end)})
    df['Date_SK'] = df['Date'].map(to_integer)
    # BUG FIX: day_name is a method — the original assigned the bound
    # method object (missing call parentheses) instead of the day names
    df["Day"] = df.Date.dt.day_name()
    # NOTE: Series.dt.weekofyear was removed in pandas 2.0;
    # isocalendar().week yields the same ISO week number
    df["Week"] = df.Date.dt.isocalendar().week
    df["Quarter"] = df.Date.dt.quarter
    df["Year"] = df.Date.dt.year
    df["Year_half"] = (df.Quarter + 1) // 2
    cols = ['Date_SK', 'Date', 'Day', 'Week', 'Quarter', 'Year', 'Year_half']
    return df[cols]


def dim_item(df):
    """Build the Item dimension joined to its category and URL metadata.

    Relies on the module-level `category_df` lookup table loaded below.
    """
    mask = ['5', '1', '4', '2', '3']
    df2 = df[mask].drop_duplicates().sort_values(by='5')
    df2['Item_SK'] = range(1, len(df2.index) + 1)
    Item = df2[['Item_SK', '5', '1', '4', '2', '3']]
    # Attach the major category via the staging look-up table
    Item = Item.merge(category_df, left_on='1', right_on='1', how='left')
    # Column '3' holds the product code used in both URL patterns
    Item_codes = Item['3']
    Item['ItemURL'] = [
        'https://www.company.com/uk/p/{}'.format(code)
        for code in Item_codes]
    Item['ImageURL'] = [
        'https://media2.companyassets.com/i/company/{}/'.format(code)
        for code in Item_codes]
    return Item


def sales(df):
    """Build the Sales fact table keyed on the dimension surrogate keys.

    Relies on the module-level `Status`, `Item` and `Date` dimension
    tables built below.
    """
    # Swap natural keys for surrogate keys, one dimension at a time
    # (removed a dead no-op `Item[cols_to_use]` expression from the original)
    df = df.merge(Status, left_on='6', right_on='6',
                  how='left').drop('6', axis=1)
    columns = ['2', '3', '5', '4', '1']
    sales = df.merge(Item, left_on=columns, right_on=columns,
                     how='left').drop(columns, axis=1)
    sales.rename(columns={'date': 'Date'}, inplace=True)
    sales['Date'] = pd.to_datetime(sales['Date'])
    Date_table = Date[['Date_SK', 'Date']]
    sales = sales.merge(Date_table, left_on='Date', right_on='Date')
    # BUG FIX: the original list contained an empty-string column name
    # where '15' belongs (cols 12-14 were renamed 9-1..9-3 upstream, so
    # '15' is the only measure column otherwise missing) — TODO confirm
    # against the redacted report schema
    cols = ['Item_SK', 'Status_SK', 'Date_SK', '7', '8', '9',
            '10', '11', '9-1', '9-2', '9-3', '15',
            '16', '17', '18', '19']
    return sales[cols]


STAGING_DATA_LOCATION = Variable.get('staging_data_location')
STAR_SCHEMA_LOCATION = Variable.get('star_schema_location')

staging_df = pd.read_csv(STAGING_DATA_LOCATION + 'sales_stage.csv')
category_df = pd.read_csv(STAGING_DATA_LOCATION + 'category_stage.csv')

# Build each star-schema table (order matters: sales() reads the
# module-level dimension tables)
Status = dim_status(staging_df)
Item = dim_item(staging_df)
Date = create_date_table(staging_df)
Sales = sales(staging_df)

# Persist every table to the star-schema location
table_dict = {'Status': Status, 'Item': Item, 'Date': Date, 'Sales': Sales}
for name, table in table_dict.items():
    table.to_csv(STAR_SCHEMA_LOCATION + name + '.csv')
The following article will go over the generation of the reports