o  gk@svddlmZmZddlZddlZddlZddlZddlm Z ddl Z ddl Z ddlZddlmZmZddlZddlmZmZddlZddlmZddlmZddlmZddlZddlmZdd lmZddlZdd lm Z m!Z!d d l"m#Z$d d l%m&Z'd dl(m)Z)ia*ia+ia,ddZ-d'ddZ.ddZ/ddZ0ddZ1ddZ2ddZ3ddZ4e-d(d!d"Z5d#d$Z6d%d&Z7dS))) roc_auc_score roc_curveN) XGBClassifier XGBRegressor)rmean_squared_error)SVR)LinearRegression) KernelRidge)TransformedTargetRegressor) MinMaxScaler) AutoTokenizer AutoModel)SELFIES)load) load_smi_tedc Csdddddddddd d dd dd d ddddddddddddddddddddddddddgadd d!d"d#d$d%d&d'd#d(d)d*d+d#d,d-d.d+d#gadS)/Nhivsmiles HIV_activedata/hiv2024-06-26 11:27:37DatasetInputOutputPath Timestampesol/ESOL predicted log solubility in mols per litre data/esol2024-06-26 11:31:46freesolvexpt data/freesolv2024-06-26 11:33:47lipoy data/lipo2024-06-26 11:34:37baceClass data/bace2024-06-26 11:36:40bbbpp_np data/bbbp2024-06-26 11:39:23clintoxCT_TOX data/clintox2024-06-26 11:42:43bart SELFIES-TED,BART model for string based SELFIES modality2024-06-21 12:32:20)Name Model Name Descriptionrmol-xl Molformer0MolFormer model for string based SMILES modality2024-06-21 12:35:56mhgMHG-GEDMolecular hypergraph model2024-07-10 00:09:42smi-tedSMI-TED"SMILES based encoder decoder model)datasetsmodelsrIrIt/Users/indrapriyadarsinis/Desktop/Indra/2024/codes/AD-demo/Oct-demo/hf_repo/final_demo/spaces/hf/fm4m/models/fm4m.pyavail_models_data%s   rKFcCsHdddddddddd d dd d d dga|rtSttjdddS)NrDrErF)r9r:r;r5r6r7r<r=r>r@rArBr9raxis)rHpd DataFramedroprawrIrIrJ avail_models8s    rScCs:tdd}t|aWdtS1swYtS)Ndownstream_models.jsonr)openjsonrdownstream_models)outfilerIrIrJavail_downstream_modelsIs   rZc Csjdddddddddd d dd dd d ddddddddddddddddddddddddddgatS)Nrrrrrrrrrr r!r"r#r$r%r&r'r(r)r*r+r,r-r.r/r0r1r2r3r4)rGrIrIrIrJavail_datasetsPs.r[c Cszdddddddddd d dd dd d ddddddddddddddddddddddddddg}dd d!d"d#d$d%d"d&d'd(d"d)d*d(d"d+d,d-d"d.d/d0d"g}d1d2d3d"d4d5d6d"d7d8d9d"d:d;d}t||Wd?n1swYtd@d>}t||Wd?n1swYtdAd>}t||Wd?d?S1swYd?S)Ba'datasets = {"esol": ["smiles", "ESOL predicted log solubility in mols per litre", "data/esol", "2024-06-26 11:36:46.509324"], "freesolv": ["smiles", "expt", "data/freesolv", "2024-06-26 11:37:37.393273"], "lipo": ["smiles", "y", "data/lipo", "2024-06-26 11:37:37.393273"], "hiv": ["smiles", "HIV_active", "data/hiv", "2024-06-26 11:37:37.393273"], "bace": ["smiles", "Class", "data/bace", "2024-06-26 11:38:40.058354"], "bbbp": ["smiles", "p_np", "data/bbbp","2024-06-26 11:38:40.058354"], "clintox": ["smiles", "CT_TOX", "data/clintox","2024-06-26 11:38:40.058354"], "sider": ["smiles","1:", "data/sider","2024-06-26 11:38:40.058354"], "tox21": ["smiles",":-2", "data/tox21","2024-06-26 11:38:40.058354"] }rrrrrrrrrr r!r"r#r$r%r&r'r(r)r*r+r,r-r.r/r0r1r2r3r4r5r7r8)r9r;rr<r>r?r@ZMHGrCzspec-gruzSpectrum modality with GRUz spec-lstmzSpectrum modality with LSTMz2024-07-10 00:09:54z3d-vaezVAE model for 3D atom positionsz2024-07-10 00:10:08rzXG Boost Classifierz2024-06-21 12:31:20rzXG Boost Regressorz2024-06-21 12:32:56z2-FNNzA two layer feedforward networkz2024-06-24 14:34:16z3-FNNz!A three layer feedforward networkz2024-06-24 14:38:37 datasets.jsonwN models.jsonrT)rVrWdump)rGrHrXrYrIrIrJresetdsR         "r`cCs@tdd}tt|Wdn1swYtdS)Nr\r])rVrWr_rGrK)Z list_datarYrIrIrJupdate_data_list  racC@tdd}t||Wdn1swYtdS)Nr^r]rVrWr_rKZ list_modelrYrIrIrJupdate_model_listrbrfcCrc)NrTr]rdrerIrIrJupdate_downstream_model_listrbrgTc Csjdddddd}||vr||}|dkrStd}t||}t|}||}t|} Wdn1s>wY|sOt|}t| } || fS|dkrpt }||j||d}|j||d} || fS|dkrt dd d }t|j||d }|j||d } Wd|| fS1swY|| fS|dkr1t j d d d d}t j d d d} t|tkr| |d dd} n | t|jd dd} t|di| } Wdn1swY| j}t|tkr| |d dd} n | t|jd dd} t|di| } Wdn 1swY| j} |s1t|}t| } || fS)Nr@r5r<rD)rAr6 MolFormerr=rEzAmodels/mhg_model/pickles/mhggnn_pretrained_model_0724_2023.pickle) return_tensorz./models/smi_ted/smi_ted_lightzsmi-ted-Light_40.pt)folderZ ckpt_filename)Z return_torchzibm/MoLFormer-XL-both-10pctT)Zdeterministic_evaltrust_remote_code)rkpt)paddingreturn_tensorsrI)keysr@rtorchno_gradencodestackrNrOr5rr from_pretrainedr typelistvalues pooler_output) train_data test_data model_typerialiasmodelZ train_embx_batchZtest_emb x_batch_test tokenizerinputsoutputsrIrIrJget_representationsj         .&   ""       rc! st|ddddd}tdd}t|}tt|dj||t|djvr7||vr4||}n |}ntd dSt}t|}tt|d j|t|d jvr|}td |d |d d} t | \} } } } Wdn1sywYtdnAtd| d}t |d|d}t |d|d}t |d|d} t |d|d} t |||\} } tdtd|dkrtd6i|}|| | || dddf}t| |}t| |\}}}td|dz&td|d |d d} t | \}}Wdn 1swYWnQtdtjddddddd }td!t| }|| d|}| jd|fd"d#ttD}fd$d#ttD}||}||}td%Yd|d}||||||fS|d&kr4t}|| | || dddf}t| |}t| |\}}}td|dz&td|d |d d} t | \}}Wdn 1swYWnQtdtjddddddd }td!t| }|| d|}| jd|fd'd#ttD}fd(d#ttD}||}||}td%Yd|d}||||||fS|d)krtd6i|}t|td*d+d,| | }|| }tt| |} td-| dd-| d}tdtjddddddd }td!t| }|| d|}| jd||}|}td%|| | |||fS|d.kr t d6i|}t|td*d+d,| | }|| }tt| |} td-| dd-| d}tdtjddddddd }td!t| }|| d|}| jd||}|}td%|| | |||fS|d/krut!d6i|}t|td*d+d,| | }|| }tt| |} td-| dd-| d}tdtjddddddd }td!t| }|| d|}| jd||}|}td%|| | |||fS|d0krtd1dd2d3d4d5}t|td*d+d,| | }|| }tt| |} td-| dd-| d}tdtjddddddd }td!t| }|| d|}| jd||}|}td%|| | |||fSdS)7Nr@r5r<rDrAr6rhrETrQr9Model not availabler./representation/_.pklrbz# Representation loaded successfullyzCustom Dataset,rr Calculating ROC AUC Score ...rROC-AUC Score: .4f ./plot_emb/Generating latent plots euclidean 皙?Fmetric n_neighbors n_components low_memorymin_distverbosecg|] }|dkr|qSrrI.0indexxrIrJ 8z single_modal..crrrIrrrIrJr9rGenerating latent plots : DoneDefaultClassifiercrrrIrrrIrJrXrcrrrIrrrIrJrYrrr feature_range regressor transformer RMSE Score: Kernel RidgeLinear RegressionDefaultRegressorrbfscale{Gz?kerneldegreeCgammaepsilonrI)"printrSrNrOrvrwror[rVpicklersplitread_csvrrfit predict_probarrumapUMAPnpminimumlen fit_transformrangerr r predictsqrtrr r)!r}datasetdownstream_modelparamsr|datadfr{taskf1r~y_batchr y_batch_test componentsryrzxgb_predict_concaty_probroc_aucfprtprrclass_0class_1reducer n_samples features_umapindex_0index_1resultr RMSE_scorerIrrJ single_modalsT                             rc) st|t}t|}t|dj|t|djvr!|}d}n7d}|d}t|d|d} t|d|d} t|d|d} t|d|d} td tdd }t|}t|d jd d ddd} t | t| r6t |D]\}}|| vr| |}n|}|dkr|rt d|d|dd}t|\}} }} Wdn1swYtd|d|dqt| | |\}}t|}t|}q|rt d|d|dd}t|\}}}}td|d|dWdn 1s wYnt| | |\}}t|}t|}tj||gdd}tj||gdd}qntddS|jd}ddt|D|_|jd}ddt|D|_tdz#t d|dd}t|\}}Wdn 1s{wYWn[tdtjdd ddd!dd"}td#t|}||d|}d$|vr| jd|fd%dttD}fd&dttD}||}||}n|}|}td'Ytd(|d)kr)td:i|} | || | |dddf}!t| |!}"t| |!\}#}$}%td*|"d+td*|"d+d*|"d+}&|&|"|#|$||fS|d,krmt} | || | |dddf}!t| |!}"t| |!\}#}$}%td*|"d+td*|"d+d*|"d+}&|&|"|#|$||fS|d-krt d:i|}'t!|'t"d.d/d0|| }|#|}!t$t%| |!}(td1|(d+d1|(d+}&|&|(| |!||fS|d2krt&d:i|}'t!|'t"d.d/d0|| }|#|}!t$t%| |!}(td1|(d+d1|(d+}&|&|(| |!||fS|d3kr!t'd:i|}'t!|'t"d.d/d0|| }|#|}!t$t%| |!}(td1|(d+d1|(d+}&|&|(| |!||fS|d4kr_t d5dd6d7d8d9}'t!|'t"d.d/d0|| }|#|}!t$t%| |!}(td1|(d+d1|(d+}&|&|(| |!||fSdS);NrTFrrrrrzCustom Dataset loadedrQr9r@r5r<rDrrrrrz Loaded representation/rLrcSg|]}|dqSrrIrirIrIrJrzmulti_modal..cSrrrIrrIrIrJrrz#Representations loaded successfullyrz _multi.pklrrrrrr ClassifiercrrrIrrrIrJr&rcrrrIrrrIrJr'rrrrrrrrrrrrrrrrrrrrrI)(rr[rNrOrvrwrrrSsetissubsetro enumeraterVrrrconcatshapercolumnsrrrrrrrrrrrrr r rrrrr )) model_listrrrrrr predefinedrryrzrrr|rr}r{rr~rZ x_batch_1Z y_batch_1Zx_batch_test_1Zy_batch_test_1 num_columnsrrrrrrrrrrrrrrrrrIrrJ multi_modals                                   r)F)T)8sklearn.metricsrrdatetimeosrnumpyrmatplotlib.pyplotpyplotpltpandasrNrrWxgboostrrxgbr sklearn.svmrsklearn.linear_modelrsklearn.kernel_ridger Zsklearn.composer Zsklearn.preprocessingr rp transformersr r Zselfies_model.loadrr5Z mhg_modelrr@Zsmi_ted.smi_ted_light.loadrrGrHrXrKrSrZr[r`rarfrgrrrrIrIrIrJsP          7 ? a