Spaces:
Runtime error
Runtime error
File size: 7,380 Bytes
75660bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import pandas as pd
import lib.utils as libPaths
import lib.claims as libClaims
from lib.models import mdl_utils, mdl_xgb, mdl_logR, mdl_svm
from lib.models import mdl_autoenc, mdl_kmeans
import sys
#--- trace switches: each TRACE print in this module is guarded by one of these flags
#--- m_blnTraceOn gates the primary TRACE output
m_blnTraceOn = True
#--- m_blnTrace2On gates the more verbose step-by-step TRACE output
#--- (in the visible code only get_xgbPredict checks it)
m_blnTrace2On = False
#--- load, merge data from file
#--- path constants copied from lib.utils (imported here as libPaths);
#--- none of the three is referenced by the functions visible in this section
m_kstrDataPath = libPaths.pth_data
m_kstrModelPath = libPaths.pth_model
m_kstrBinModelPath = libPaths.pth_binModels
def load_providers(blnIsTrain=False):
    """Load the pickled claims and roll them up to one summed row per provider.

    Args:
        blnIsTrain: forwarded unchanged to ``libClaims.loadPkl_claims``.

    Returns:
        DataFrame grouped on ``Provider`` (kept as a regular column because
        of ``as_index=False``) with the remaining columns aggregated by
        ``sum``.
    """
    #--- columns removed before the per-provider aggregation
    lstDropCols = [
        'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
        'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
        'Gender', 'Race', 'County',
    ]

    pdfRaw = libClaims.loadPkl_claims(blnIsTrain)
    pdfTrimmed = pdfRaw.drop(columns=lstDropCols)

    #--- one row per provider
    return pdfTrimmed.groupby(['Provider'], as_index=False).agg('sum')
#--- feat eng
def do_featEng(pdfClaimsFeatEng, blnIsTrain=False):
    """Attach per-provider mean columns to the claims frame.

    For every (new column, source column) pair listed below, the new column
    receives the mean of the source column over all rows that share the same
    ``Provider`` value. The columns are written onto the frame that was
    passed in (no copy is taken) and that same frame is returned.

    Args:
        pdfClaimsFeatEng: claims DataFrame; it is indexed by ``Provider`` and
            by each of the six source columns named below.
        blnIsTrain: only echoed in the trace output.

    Returns:
        The input DataFrame, now carrying the six ``*_ProviderAvg`` columns.
    """
    if m_blnTraceOn:
        print("TRACE (providers.doFeatEng): blnIsTrain, ", blnIsTrain)

    #--- add new features to assist with predictions
    #--- (new feature column, source column); list order = insertion order
    lstProviderAvgs = [
        ('InscClaimReimbursement_ProviderAvg', 'InscClaimAmtReimbursed'),
        ('DeductibleAmtPaid_ProviderAvg', 'DeductibleAmtPaid'),
        ('IPAnnualReimbursementAmt_ProviderAvg', 'IPAnnualReimbursementAmt'),
        ('IPAnnualDeductibleAmt_ProviderAvg', 'IPAnnualDeductibleAmt'),
        ('OPAnnualReimbursementAmt_ProviderAvg', 'OPAnnualReimbursementAmt'),
        ('OPAnnualDeductibleAmt_ProviderAvg', 'OPAnnualDeductibleAmt'),
    ]
    for strNewCol, strSrcCol in lstProviderAvgs:
        serProviderMean = pdfClaimsFeatEng.groupby(['Provider'])[strSrcCol].transform('mean')
        pdfClaimsFeatEng[strNewCol] = serProviderMean

    return pdfClaimsFeatEng
def get_logrPredict(pdfTestClaims):
    """Produce logistic-regression predictions for the given test claims.

    Steps, in order: feature engineering via ``do_featEng``, scaling via
    ``mdl_utils.doProviders_stdScaler``, prediction via ``mdl_logR.predict``,
    then the engineered claims are summed per ``Provider`` and column 0 of
    the predictions is inserted as the leading ``"hasAnom?"`` column.

    Args:
        pdfTestClaims: claims DataFrame; note ``do_featEng`` adds columns to
            this same object.

    Returns:
        DataFrame of per-provider sums with ``"hasAnom?"`` as first column.
    """
    pdfEngineered = do_featEng(pdfTestClaims, False)
    npaProviders = mdl_utils.doProviders_stdScaler(pdfEngineered, False)

    #--- the returned frame is not used in this function; the call itself is
    #--- retained so that whatever work the helper performs still happens
    mdl_utils.doProviders_stdScaler_toPdf(npaProviders)

    #--- run the model and wrap the raw output in a frame
    pdfLabels = pd.DataFrame(mdl_logR.predict(npaProviders))

    #--- stitch the grouped data with the labels
    pdfByProvider = pdfEngineered.groupby(['Provider'], as_index=False).agg('sum')
    pdfByProvider.insert(0, "hasAnom?", pdfLabels[0])
    return pdfByProvider
def get_svmPredict(pdfTestClaims):
    """Produce support-vector-machine predictions for the given test claims.

    Steps, in order: feature engineering via ``do_featEng``, scaling via
    ``mdl_utils.doProviders_stdScaler``, prediction via ``mdl_svm.predict``,
    then the engineered claims are summed per ``Provider`` and column 0 of
    the predictions is inserted as the leading ``"hasAnom?"`` column.
    The shape of each intermediate is printed when ``m_blnTraceOn`` is set.

    Args:
        pdfTestClaims: claims DataFrame; note ``do_featEng`` adds columns to
            this same object.

    Returns:
        DataFrame of per-provider sums with ``"hasAnom?"`` as first column.
    """
    def _traceShape(strMsg, objShaped):
        #--- emit one TRACE line with the object's shape, if tracing is on
        if m_blnTraceOn:
            print(strMsg, objShaped.shape)

    _traceShape("TRACE (providers.get_svmPredict) pdfClaims.shape: ", pdfTestClaims)

    pdfEngineered = do_featEng(pdfTestClaims, False)
    npaProviders = mdl_utils.doProviders_stdScaler(pdfEngineered, False)
    #--- the returned frame is not used in this function; the call itself is
    #--- retained so that whatever work the helper performs still happens
    mdl_utils.doProviders_stdScaler_toPdf(npaProviders)
    _traceShape("TRACE (providers.get_svmPredict) npaScaled.shape: ", npaProviders)

    ndaLabels = mdl_svm.predict(npaProviders)
    _traceShape("TRACE (providers.get_svmPredict) npaPredict.shape: ", ndaLabels)

    pdfLabels = pd.DataFrame(ndaLabels)
    _traceShape("TRACE (providers.get_svmPredict) pdfPredict.shape: ", pdfLabels)

    #--- stitch the grouped data with the labels
    pdfByProvider = pdfEngineered.groupby(['Provider'], as_index=False).agg('sum')
    _traceShape("TRACE (providers.get_svmPredict) pdfResults.shape: ", pdfByProvider)

    pdfByProvider.insert(0, "hasAnom?", pdfLabels[0])
    return pdfByProvider
def get_xgbPredict(pdfTestClaims):
    """Produce XGBoost predictions for the given test claims.

    Steps, in order: feature engineering via ``do_featEng``, scaling via
    ``mdl_utils.doProviders_stdScaler``, prediction via ``mdl_xgb.predict``,
    then the engineered claims are summed per ``Provider`` and column 0 of
    the predictions is inserted as the leading ``"hasAnom?"`` column.

    Error handling: any ``Exception`` raised along the way is printed.
    If it happened after the per-provider frame was built (i.e. while
    inserting the labels), that partially built frame is still returned,
    as before. If it happened earlier there is nothing to return, so the
    original exception is re-raised; previously this path failed with an
    ``UnboundLocalError`` on the ``return`` line, hiding the real cause.
    ``KeyboardInterrupt``/``SystemExit`` are no longer intercepted.

    Args:
        pdfTestClaims: claims DataFrame; note ``do_featEng`` adds columns to
            this same object.

    Returns:
        DataFrame of per-provider sums, with ``"hasAnom?"`` as first column
        unless the label insert failed.

    Raises:
        Exception: whatever the pipeline raised before the per-provider
            frame existed.
    """
    #--- sentinel: lets the handler tell "nothing built yet" from "partial"
    pdfResults = None
    try:
        #--- load test data
        pdfClaims = pdfTestClaims
        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) pdfClaims.shape): ", pdfClaims.shape)

        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) doFeatEng (provider) ... ")
        pdfFeatEng = do_featEng(pdfClaims, False)

        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) doStdScaler ... ")
        npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)

        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) doStdScaler_toPdf ... ")
        #--- the returned frame is not used in this function; the call itself
        #--- is retained so that whatever work the helper performs still happens
        mdl_utils.doProviders_stdScaler_toPdf(npaScaled)

        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) run prediction ... ")
        ndaPredict = mdl_xgb.predict(npaScaled)

        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) convert to dataframe ... ")
        pdfPredict = pd.DataFrame(ndaPredict)
        #--- rows whose prediction value is positive are counted as anomalies
        pdfAnoms = pdfPredict[pdfPredict[0] > 0]
        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) pdfPredict.shape: ", pdfPredict.shape)
        if m_blnTraceOn:
            print("TRACE (providers.get_xgbPredict) #anoms: ", len(pdfAnoms.index))

        #--- group data by provider
        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) group claims by provider ... ")
        pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) pdfResults.shape: ", pdfResults.shape)

        #--- stitch the grouped data with the labels
        if m_blnTrace2On:
            print("TRACE (providers.get_xgbPredict) merge labels into dataset ... ")
        pdfResults.insert(0, "hasAnom?", pdfPredict[0])

    except Exception:
        e = sys.exc_info()
        print("ERROR (providers.get_xgbPredict_genError): ", e)
        if pdfResults is None:
            #--- nothing usable was produced: surface the real failure
            raise

    if m_blnTraceOn:
        print("TRACE (providers.get_xgbPredict) proc complete; return ... ")
    return pdfResults
def get_encPredict(pdfTestClaims):
    """Produce autoencoder predictions for the given test claims.

    Steps, in order: feature engineering via ``do_featEng``, scaling via
    ``mdl_utils.doProviders_stdScaler``, conversion of the scaled array with
    ``mdl_utils.doProviders_stdScaler_toPdf``, prediction via
    ``mdl_autoenc.predict`` on that converted frame, then the engineered
    claims are summed per ``Provider`` and column 0 of the predictions is
    inserted as the leading ``"hasAnom?"`` column.

    NOTE(review): earlier comments here mentioned PCA; no PCA call is visible
    in this function, so presumably it happens inside ``mdl_autoenc.predict``
    -- confirm.

    Args:
        pdfTestClaims: claims DataFrame; note ``do_featEng`` adds columns to
            this same object.

    Returns:
        DataFrame of per-provider sums with ``"hasAnom?"`` as first column.
    """
    if m_blnTraceOn:
        print("TRACE (providers.get_encPredict) ppdfClaims.shape: ", pdfTestClaims.shape)

    #--- engineered frame is still one row per claim (not grouped by provider)
    pdfEngineered = do_featEng(pdfTestClaims, False)

    #--- standard scaling, then back to a DataFrame for the autoencoder
    npaProviders = mdl_utils.doProviders_stdScaler(pdfEngineered, False)
    pdfProviders = mdl_utils.doProviders_stdScaler_toPdf(npaProviders)

    #--- autoencoder prediction, wrapped in a frame
    pdfLabels = pd.DataFrame(mdl_autoenc.predict(pdfProviders))

    #--- stitch the grouped data with the labels
    pdfByProvider = pdfEngineered.groupby(['Provider'], as_index=False).agg('sum')
    pdfByProvider.insert(0, "hasAnom?", pdfLabels[0])
    return pdfByProvider
|