Sign in

Transforming a pre-formatted excel report into a tabular format without macros — Reporting system part 2

This article follows on from my previous article

In the past when dealing with pre-formatted reports I’ve either had to ask for direct access to the reporting database and when thats not been possible i’ve had to rely on vba/macros or manual intervention to cleanse the report into a format that i could then work with. So I thought this would be a good challenge for my new python skills.

This is the structure of the data that I extracted in the last blog post. I’ve only put the structure of the data and no column names to avoid identification of the supplier.

The first thing I noticed is that there was some hidden files in the folder, now this might have been because I had a file open and a temporary file was created, however I thought I’d write a function to make sure we don’t get anything we dont want in.

# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
import re
import time
from airflow.models import VariableRAW_DATA_LOCATION = Variable.get('raw_data_location')
STAGING_DATA_LOCATION = Variable.get('staging_data_location')
# Set Pandas to display 500 rows
pd.set_3('display.max_rows', 500)
# Set Folder to use
def cleanfiles(folder):
#List files in folder
folderfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
files = []
#Remove hidden files
for file in folderfiles:
if not file.startswith('~$'):
#Return only excel files
files = [file for file in files if file.endswith('xlsx')]
return files

The next step is to create a function that will clean each individual file and then concatenate them into one dataframe

def cleandata(folder):
df_list = []
for file in cleanfiles(folder): #Read in excel files
df = pd.read_excel(folder+'/'+file, skiprows=9)
#Drop empty column
df = df.drop('Unnamed: 0',axis=1)
# Drop all empty rows
df = df.dropna(how='all',axis=0)
# Fill NAN cells with 0
df['18'].fillna(0, inplace=True)
df['19'].fillna(0, inplace=True)
# Drop any remaining rows with NAN values in any cell
df = df.dropna(how='any', axis=0)
# Drop duplicate rows ( This eliminates the duplicate headers from the separate files)
df = df.drop_duplicates(keep=False)
#Add date in new column
date_time_str = file.split('.')[0]
filedate = datetime.datetime.strptime(date_time_str, '%Y-%m-%d')
df['date'] = filedate
# Rename and change types
df['2'] = df['2'].astype('int64')
df['3'] = df['3'].astype('int64')
df['4'] = df['4'].astype('int64')
df['7'] = df['7'].astype('float')
df['8'] = df['8'].astype('float')
df['9'] = df['9'].astype('int64')
df['10'] = df['10'].astype('float')
df['11'] = df['11'].astype('int64')
df['12'] = df['12'].astype('int64')
df['13'] = df['13'].astype('int64')
df['14'] = df['14'].astype('int64')
df['15'] = df['15'].astype('int64')
df['16'] = df['16'].astype('int64')
df['17'] = df['17'].astype('int64')
df['18'] = df['18'].astype('int64')
df['19'] = df['19'].astype('int64')
df.rename(columns =
#Append to list of data frames
#Concatenate newly clean dataframes into one large dataframe
df = pd.concat(df_list)
return df

The above code cleans the data up so that each table of data for each category in the single report appends into one table and empty cells are filled along with data types are assigned.

We now however need to somehow associate each row in the data frame with the associated category which was above each table in the original report

There is one column which is a sub category in the table which we will use to create a lookup table and then join it on to the main table in the next step

def category_df(folder):
category_df_list = []
def hasNumbers(inputString):
return bool('\d', inputString))
def isDept(x):
# 1 refers to column 1 which is the sub-category
if x =='1':
return True

The above code assigns a column that dertmines whether the column string contains a numeric value or not. The categories above the tables all contained numbers and I used this to help me create the category column. As the category column itself fell directly on top of the subcategory column.

def category_df(folder):
category_df_list = []
def hasNumbers(inputString):
return bool('\d', inputString))
def isDept(x):
if x =='1':
return True
for file in cleanfiles(folder):
#Read in excel files
categorydf = pd.read_excel(folder+'/'+file, skiprows=6)
categorydf = categorydf['Unnamed: 1'].reset_index()
categorydf = categorydf.fillna(method='ffill')
categorydf.drop('index',axis=1, inplace=True)
#Add column to determine if column has integer in string
categorydf['bool'] = categorydf['Unnamed: 1'].map(lambda x: hasNumbers(x))
#If the column contains an integer use this value as the column value for the category column
categorydf['Category'] = categorydf.apply(lambda x: x['Unnamed: 1'] if x['bool'] ==True else None, axis=1 )
#forward fill categories to associate the sub category with the category
categorydf.fillna(method='ffill', inplace=True)
#Create boolean column to decide if the column is the column header
categorydf['isDept?'] = categorydf['Unnamed: 1'].map(lambda x: isDept(x))
#Filter out any rows with the sub-category header
mask = (categorydf['bool'] ==False) & (categorydf['isDept?'] != True)
categorydf = categorydf[mask]
categorydf = categorydf.copy()
categorydf.rename(columns={'Unnamed: 1': '1'}, inplace=True)
categorydf['Category'] = categorydf['Category'].map(lambda x: re.sub(r'\d+', '', x))
categorydf['Category'] = categorydf['Category'].map(lambda x: x[1:])
categorydf.drop(['bool','isDept?'], axis=1,inplace=True)
category_df = pd.concat(category_df_list)
#Final result with department names and categories side by side as a look up table category_df = category_df[['Category','1']].sort_values(by='Category', ascending=True).reset_index().drop('index',axis=1)
return category_df
df = cleandata(folder)df.to_csv(STAGING_DATA_LOCATION+'sales_stage.csv')categorydf = category_df(folder)

The end result is two files one with all reports cleaned into one large data frame and a look up table to match sub-category to major category

We had to derive the sub-category and category this way, as we were not able to manually create a lookup table due to the fact that we do not control any new sub-categories/categories being added

The below code then takes these staging tables and creates a star schema with some additional meta data for the image

# Import libraries
import pandas as pd
import datetime
from os import listdir
from os.path import isfile, join
from airflow.models import Variable
def dim_status(df):
current_status = df['6'].drop_duplicates().reset_index().drop('index',axis=1)
current_status['Status_SK'] = range(1, len(current_status.index)+1)
current_status = current_status[['Status_SK','6']]
return current_status
def to_integer(dt_time):
return 10000*dt_time.year + 100*dt_time.month +
def create_date_table(df):
start = df['date'].min()
end = df['date'].max()
df = pd.DataFrame({"Date": pd.date_range(start, end)})
df['Date_SK'] = df['Date'].map(lambda x: to_integer(x))
df["Day"] = df.Date.dt.day_name
df["Week"] = df.Date.dt.weekofyear
df["Quarter"] = df.Date.dt.quarter
df["Year"] = df.Date.dt.year
df["Year_half"] = (df.Quarter + 1) // 2
date_table = df
cols = ['Date_SK','Date','Day','Week','Quarter','Year','Year_half']
date_table = date_table[cols]
return date_table
def dim_item(df):
mask = ['5','1','4','2','3']
df2 = df[mask].drop_duplicates().sort_values(by='5')
df2['Item_SK'] = range(1, len(df2.index)+1)
Item = df2[['Item_SK','5','1','4','2','3']]
Item = Item.merge(category_df,left_on='1', right_on='1', how='left')
Item_codes = Item['3']
Item_urls = []
for code in Item_codes:
URL = '{}'.format(code)
Image_urls = []
for code in Item_codes:
URL = '{}/'.format(code)
Item['ItemURL'] = Item_urls
Item['ImageURL'] = Image_urls
return Itemdef sales(df):
df = df.merge(Status, left_on='6',right_on='6', how='left').drop('6', axis=1)
cols_to_use = Item.columns.difference(df.columns)
columns = ['2','3','5','4','1']
sales = df.merge(Item, left_on=columns, right_on=columns, how='left').drop(columns,axis=1)
sales.rename(columns={'date':'Date'}, inplace=True)
sales['Date']= pd.to_datetime(sales['Date'])
Date_table = Date[['Date_SK','Date']]
sales = sales.merge(Date_table,left_on='Date',right_on='Date')
cols = ['Item_SK','Status_SK','Date_SK','7', '8', '9',
'10', '11', '9-1', '9-2',
'9-3', '',
'16', '17', '18',
'19' ]
sales = sales[cols]
return salesSTAGING_DATA_LOCATION = Variable.get('staging_data_location')
STAR_SCHEMA_LOCATION = Variable.get('star_schema_location')
staging_df = pd.read_csv(STAGING_DATA_LOCATION+'sales_stage.csv')
category_df = pd.read_csv(STAGING_DATA_LOCATION+'category_stage.csv')
Status = dim_status(staging_df)
Item = dim_item(staging_df)
Date = create_date_table(staging_df)
Sales = sales(staging_df)
table_dict = {'Status':Status,'Item':Item,'Date':Date,'Sales':Sales}for name,table in table_dict.items():

The following article will go over the generation of the reports