To better demonstrate what I'm going for, here's the Python version of the program: import pandas as pd
import numpy as np
import os
import string
import datetime as dt
def time_processor(df, start_col, end_col, key_col):
df_1 = df[[key_col, start_col, end_col]]
# Use sort_values to sort data and the groupby function to generate a .groupby() object
groupob = df_1.sort_values(by = list(df_1.columns)).groupby(key_col)
# Shift rows down by groups. This method will always generate one null row per group
df_1[f"Lag {start_col}"] = groupob.shift()[start_col]
df_1[f"Lag {end_col}"] = groupob.shift()[end_col]
# Null values are dropped to ensure they don't interfere with calculations
df_2 = df_1.dropna().sort_values(by = [key_col, start_col, end_col])
# Lambda function returns start_date if start_date is earlier than lag_start_date
df_2[f"True {start_col}"] = df_2.apply(lambda x: x[start_col] if x[start_col] < x[f"Lag {start_col}"] else x[f"Lag {start_col}"], axis = 1)
# Lambda function returns lag_end_date if lag_end_date is earlier than start_date
df_2[f"True {end_col}"] = df_2.apply(lambda x: x[f"Lag {end_col}"] if x[f"Lag {end_col}"] < x[start_col] else x[start_col], axis = 1)
# The following for loop creates a table for each unique value in the customer_id column, changes the final value
# of the final_end_date column into the latest date of the delinquency_end_date column and appends it to a list
# for reassembly
new_list = []
for i in df_2[key_col].unique():
df_3 = df_2[df_2[key_col] == i].reset_index(drop = True)
df_3.loc[:, f"True {end_col}"][len(df_3) - 1] = df_3[end_col].max()
new_list.append(df_3)
return pd.concat(new_list)
def validation_func(df, start_col, end_col, key_col):
# This function is based on the idea that if the previous function had worked correctly,
# dates would be changing in a monotonic fashion.
valid_dict = {
"Start Count" : [],
"End Count" : [],
"Count" : []
}
# The following nested loop compares an observation to the observation that comes after it to
# confirm that dates are indeed changing monotonically. Start_num and end_num are
# incremented by 1 for every observation that satisfies the above condition.
for i in df[key_col].unique():
df_1 = df[df[key_col] == i]
start_list = list(df_1[start_col])
end_list = list(df_1[end_col])
start_num, end_num, count_num = 0, 0, 0
while True:
if start_list[count_num] <= start_list[count_num + 1]:
start_num += 1
if end_list[count_num] <= end_list[count_num + 1]:
end_num += 1
count_num += 1
if count_num == len(df_1) - 1:
start_num += 1
end_num += 1
count_num += 1
break
for (a, b) in zip(valid_dict, [start_num, end_num, count_num]):
valid_dict[a].append(b)
# If the sum of start_count and the sum of end_count equals the length of the dataset,
# that would mean that dates are changing monotonically throughout the entire dataset
# and thus the results are correct.
if sum(valid_dict["Start Count"]) == len(df) & sum(valid_dict["End Count"]) == len(df):
print("Start Count and End Count match dataset length")
else:
print("Start Count and End Count do not match dataset length")
return valid_dict
if __name__ == "__main__":
letters = list(string.ascii_uppercase)
lambda_func = lambda x: f"0{x}" if x < 10 else str(x)
df = pd.DataFrame({
"Customer Id" : [letters[np.random.randint(len(letters[0:6]))] for i in range(100)],
"Delinquency Start Date" : [pd.to_datetime(f"{np.random.randint(2014, 2015)}-{lambda_func(np.random.randint(1, 12))}-{lambda_func(np.random.randint(1, 29))}") for i in range(100)],
"Delinquency End Date" : [pd.to_datetime(f"{np.random.randint(2016, 2017)}-{lambda_func(np.random.randint(1, 12))}-{lambda_func(np.random.randint(1, 29))}") for i in range(100)]
})
# df = pd.DataFrame({
# "Customer Id" : ["A", "A", "A"],
# "Delinquency Start Date" : ["2018-01-31", "2018-03-31", "2018-04-15"],
# "Delinquency End Date" : ["2018-02-14", "2018-07-02", "2018-05-13"]
# })
new_df = time_processor(df, "Delinquency Start Date", "Delinquency End Date", "Customer Id")
valid_dict = validation_func(new_df, "True Delinquency Start Date", "True Delinquency End Date", "Customer Id")
new_df["Del. Days"] = ((pd.to_datetime(new_df["True Delinquency End Date"]) - pd.to_datetime(new_df["True Delinquency Start Date"])).astype(str).str.zfill(10).str[0:5]).astype(int)
... View more