datajoely
06/27/2022, 6:23 PMsjster
06/27/2022, 6:43 PMdatajoely
06/27/2022, 6:46 PMsjster
06/27/2022, 6:52 PMdatajoely
06/27/2022, 6:52 PMsjster
06/27/2022, 6:57 PMyaml
# Kedro catalog entry for the joined ev/tu/target table.
# Indentation restored: `dataset` describes how each individual partition is
# loaded/saved, while `path` and `filename_suffix` belong to the
# PartitionedDataSet itself.
joined_es_tu_target:
  type: PartitionedDataSet
  dataset:
    type: pandas.ParquetDataSet
    save_args:
      index: False  # passed to the underlying Parquet writer for every partition
  path: data/03_primary/joined_es_tu_target  # directory that holds the partitions
  filename_suffix: ".parquet"  # suffix appended to each partition id on disk
datajoely
06/27/2022, 7:01 PMsjster
06/27/2022, 7:05 PMpython
def read_inputs_join(ev: pd.DataFrame, tu: pd.DataFrame, df_target: pd.DataFrame) -> pd.DataFrame:
    """Inner-join ``ev``, ``tu`` and ``df_target`` on ``IDENTITY_ID``.

    ``ev`` is inner-joined with ``tu`` first, and ``df_target`` is then
    inner-joined with that result, so only identities present in all three
    frames survive. The input frames are left untouched; indexed copies are
    used for the joins, so the function can be called repeatedly with the
    same inputs.

    Args:
        ev: Frame with an ``IDENTITY_ID`` column.
        tu: Frame with an ``IDENTITY_ID`` column.
        df_target: Frame with an ``IDENTITY_ID`` column.

    Returns:
        The joined frame, indexed by ``IDENTITY_ID``, with the columns of
        ``df_target`` followed by those of ``ev`` and ``tu``.

    Raises:
        KeyError: If an input lacks ``IDENTITY_ID``, or if the joined frame
            has no ``AVG_AGE_OF_CREDIT`` column (it is printed below).
        ValueError: If the frames share non-key column names (pandas refuses
            to join overlapping columns without a suffix).
    """
    log = logging.getLogger(__name__)

    print(ev.head())
    print(tu.head())

    # Non-mutating set_index: the original used inplace=True, which stripped
    # IDENTITY_ID from the caller's frames and broke any second use of them.
    ev_indexed = ev.set_index('IDENTITY_ID')
    tu_indexed = tu.set_index('IDENTITY_ID')
    target_indexed = df_target.set_index('IDENTITY_ID')

    log.info("Length of ev is %d", len(ev_indexed))
    log.info("Length of tu is %d", len(tu_indexed))
    log.info("Length of target is %d", len(target_indexed))

    df_joined = ev_indexed.join(tu_indexed, how='inner')
    df_ev_tu_target = target_indexed.join(df_joined, how='inner')

    print("Target type is ", type(df_ev_tu_target))
    print("Target columns ", df_ev_tu_target.columns)
    print("Target head ", df_ev_tu_target.head())
    print("Average age of credit ", df_ev_tu_target['AVG_AGE_OF_CREDIT'])
    log.info("Length of target joined with es_tu is %d", len(df_ev_tu_target))

    # NOTE(review): if this return value is saved through a Kedro
    # PartitionedDataSet, the save step expects a dict mapping partition ids
    # to data rather than a bare DataFrame — confirm against the catalog.
    return df_ev_tu_target
datajoely
06/27/2022, 7:22 PMsjster
06/27/2022, 7:29 PM