# Source code for utils

import pandas as pd
import requests
import time
import re
from tqdm import tqdm


######################################################################
"------------------------------Molport-------------------------------"
######################################################################


def molport_get_ids(instance, smiles_list):
    """
    Collects Molport molecule IDs for the given SMILES via the chemical-search API.

    The returned IDs are the ones later used to query prices.

    :param instance: Object whose ``login`` mapping provides the keys
        ``molport_username``, ``molport_password`` and ``molport_api_key``.
    :param smiles_list: Iterable of SMILES strings to search for.
    :return: DataFrame with the columns ``ID`` and ``Input SMILES``; one row
        per molecule returned for a query SMILES.
    :rtype: pandas.DataFrame
    """
    molport_username = instance.login['molport_username']
    molport_password = instance.login['molport_password']
    molport_api_key = instance.login['molport_api_key']

    # Username/password are used whenever a username is set; otherwise the
    # API key is sent. The credentials do not change between requests, so
    # they are built once.
    if molport_username is not None:
        credentials = {
            "User Name": molport_username,
            "Authentication Code": molport_password,
        }
    else:
        credentials = {"API Key": molport_api_key}

    # (molecule ID, query SMILES) pairs
    id_smiles_list = []

    for smiles in tqdm(smiles_list):
        payload = {
            **credentials,
            "Structure": smiles,
            "Search Type": 5,  # labelled "perfect" search by the original author
            "Maximum Search Time": 60000,
            "Maximum Result Count": 1000,
            "Chemical Similarity Index": 0.9,
        }

        r = requests.post('https://api.molport.com/api/chemical-search/search', json=payload)

        # Report a failed request and move on, as molport_collect_prices does,
        # instead of trying to decode an error body as a search result.
        if r.status_code != 200:
            print(f'Error in the request for SMILES {smiles}: {r.status_code}')
            continue

        response = r.json()

        # Only responses whose result status is 1 carry a molecule list.
        if response["Result"]["Status"] == 1:
            id_smiles_list.extend(
                (molecule["Id"], smiles) for molecule in response["Data"]["Molecules"]
            )

    return pd.DataFrame(id_smiles_list, columns=["ID", "Input SMILES"])


######################################################################
######################################################################


def process_purity(value):
    """
    Cleans a purity string by trimming tuple punctuation and percent signs.

    Leading and trailing quote, parenthesis, percent and comma characters are
    removed, so the text ``"('',)"`` becomes an empty string.

    :param value: Purity text to clean.
    :type value: str
    :return: The cleaned purity text.
    :rtype: str
    """
    wrapper_chars = "'(%),"
    cleaned = value.strip(wrapper_chars)
    return cleaned

def molport_standardize_columns(data):
    """
    Standardizes the Molport price and purity columns to the ChemSpace layout.

    All values are converted to strings, ``Purity`` is cleaned with
    ``process_purity`` and a ``Price_USD`` column is added that holds the
    price for rows whose currency is ``USD`` and an empty string otherwise.
    The ``Price`` and ``Currency`` columns are removed from the result.

    :param data: DataFrame with at least ``Purity``, ``Price`` and ``Currency``.
    :type data: pandas.DataFrame
    :return: A new, standardized DataFrame (the input is not modified).
    :rtype: pandas.DataFrame
    """
    frame = data.astype(str)

    frame['Purity'] = frame['Purity'].apply(process_purity)

    # Only USD prices are carried over; every other currency is left blank.
    is_usd = frame['Currency'] == 'USD'
    frame['Price_USD'] = ""
    frame.loc[is_usd, 'Price_USD'] = frame.loc[is_usd, 'Price']

    return frame.drop(columns=['Price', 'Currency'])


######################################################################
######################################################################


def _molport_packing_rows(molecule, input_smiles):
    """Yield one output row per packing listed under a molecule's screening block suppliers."""
    smiles = molecule.get('SMILES', "")
    suppliers = (molecule.get("Catalogues") or {}).get("Screening Block Suppliers") or []

    for supplier in suppliers:
        supplier_name = supplier["Supplier Name"]

        for catalogue in supplier["Catalogues"]:
            purity = catalogue.get("Purity", "")
            last_update_date = catalogue.get("Last Update Date Exact", "")

            # One row per packing; a catalogue without packings yields nothing.
            for packing in catalogue["Available Packings"]:
                yield (
                    "Molport",
                    input_smiles,
                    smiles,
                    last_update_date,
                    supplier_name,
                    purity,
                    packing.get("Price", ""),
                    packing.get("Amount", ""),
                    packing.get("Measure", ""),
                    packing.get("Currency", ""),
                )


def molport_collect_prices(instance, molecule_ids):
    """
    Collects price data for molecules from Molport API.

    :param instance: The PriceCollector instance containing API credentials.
    :param molecule_ids: DataFrame containing molecule IDs and SMILES.
    :type instance: PriceCollector
    :type molecule_ids: pandas.DataFrame
    :return: DataFrame containing collected price data. Only rows that carry
        a USD price are returned.
    :rtype: pandas.DataFrame
    """
    columns = ["Source", "Input SMILES", "SMILES", "Last Update Date Exact", "Supplier Name",
               "Purity", "Price", "Amount", "Measure", "Currency"]

    molport_username = instance.login['molport_username']
    molport_password = instance.login['molport_password']
    molport_api_key = instance.login['molport_api_key']

    url = 'https://api.molport.com/api/molecule/load'

    # Username/password are used whenever a username is set, otherwise the API key.
    if molport_username is not None:
        credentials = {"username": molport_username, "authenticationcode": molport_password}
    else:
        credentials = {"apikey": molport_api_key}

    molport_data = []

    for _, row in tqdm(molecule_ids.iterrows(), total=len(molecule_ids)):
        molecule_id = row['ID']
        smiles = row['Input SMILES']

        # Passing the query through ``params`` lets requests URL-encode the
        # credentials instead of pasting them into the URL verbatim.
        response = requests.post(url, params={"molecule": molecule_id, **credentials})

        if response.status_code != 200:
            print(f'Error in the request for molecule {molecule_id}: {response.status_code}')
            continue

        molecule = (response.json().get('Data') or {}).get('Molecule')
        if not molecule:
            # A 200 response without a molecule record has nothing to parse.
            print(f'No molecule data returned for molecule {molecule_id}')
            continue

        molport_data.extend(_molport_packing_rows(molecule, smiles))

    # Create a DataFrame with collected data
    df = pd.DataFrame(molport_data, columns=columns)

    df = molport_standardize_columns(df)

    # Standardization stringifies every value, so a missing price shows up as
    # "", "nan" or "None" rather than NaN; filter on those to drop rows
    # without a usable USD price.
    has_usd_price = df["Price_USD"].notna() & ~df["Price_USD"].isin(["", "nan", "None"])
    return df[has_usd_price].copy()


######################################################################
"----------------------------ChemSpace-------------------------------"
######################################################################


def chemspace_get_token(instance):
    """
    Requests an access token from the ChemSpace API.

    :param instance: Object whose ``login`` mapping provides ``chemspace_api_key``.
    :return: The ``access_token`` value of the response when the request
        returns status 200; otherwise ``None`` after printing the status code.
    """
    api_key = instance.login['chemspace_api_key']

    response = requests.get(
        "https://api.chem-space.com/auth/token",
        headers={"Authorization": f"Bearer {api_key}"},
    )

    # Anything other than 200 is reported and signalled to the caller as None.
    if response.status_code != 200:
        print("The request failed with the status code:", response.status_code)
        return None

    return response.json()["access_token"]
    

######################################################################
######################################################################


def chemspace_collect_prices(instance, smiles_list):
    """
    Collects price data for molecules from ChemSpace API.

    :param instance: The PriceCollector instance containing API credentials.
    :param smiles_list: list containing molecule SMILES.
    :type instance: PriceCollector
    :type smiles_list: list
    :return: DataFrame containing collected price data. The frame is empty
        (but has all columns) when no access token could be obtained.
    :rtype: pandas.DataFrame
    """
    columns = ["Source", "Input SMILES", "SMILES", "CAS", "Supplier Name",
               "Purity", "Amount", "Measure", "Price_USD"]

    access_token = chemspace_get_token(instance)
    if access_token is None:
        # chemspace_get_token has already reported the failure; without a
        # token no search can be made, so return an empty result.
        return pd.DataFrame([], columns=columns)

    url = "https://api.chem-space.com/v3/search/exact"
    headers = {
        "Accept": "application/json; version=3.1",
        "Authorization": f"Bearer {access_token}",
    }
    params = {
        "count": 3,
        "page": 1,
        "categories": "CSCS,CSMB",
    }

    chemspace_data = []
    last_index = len(smiles_list) - 1

    for index, smiles in tqdm(enumerate(smiles_list), total=len(smiles_list)):
        response = requests.post(url, headers=headers, data={"SMILES": smiles}, params=params)

        if response.status_code == 200:
            # One output row per (item, offer, price) combination.
            for item in response.json()['items']:
                for offer in item['offers']:
                    for price in offer['prices']:
                        chemspace_data.append((
                            "ChemSpace",
                            smiles,
                            item["smiles"],
                            item["cas"],
                            offer['vendorName'],
                            offer['purity'],
                            price['pack'],
                            price['uom'],
                            price['priceUsd'],
                        ))
        else:
            # The request failed, print the status code and response content
            print("Request failed with status code:", response.status_code)
            print("Response content:", response.text)

        # Pause for 1.5 seconds between requests (not after the last one)
        if index < last_index:
            time.sleep(1.5)

    df = pd.DataFrame(chemspace_data, columns=columns)
    return df.dropna(subset=["Price_USD"], how='all')


######################################################################
"-------------------------------MCule--------------------------------"
######################################################################


def mcule_get_ids(instance, smiles_list, batch_size=500):
    """
    Collects MCule IDs for the given SMILES using the exact-search endpoint.

    The SMILES are sent in batches so that a single request never carries
    more than ``batch_size`` queries.

    :param instance: Object whose ``login`` mapping provides ``mcule_api_key``.
    :param smiles_list: Sliceable sequence of SMILES strings.
    :param batch_size: Maximum number of SMILES per request (default 500).
    :return: DataFrame with the columns ``ID`` and ``Input SMILES``.
    :rtype: pandas.DataFrame
    """
    mcule_token = instance.login['mcule_api_key']

    headers = {
        'Authorization': 'Token ' + mcule_token,
    }

    id_smiles_list = []

    for start in range(0, len(smiles_list), batch_size):
        batch_smiles = smiles_list[start:start + batch_size]

        # Send a POST request to MCule API for exact search
        response = requests.post(
            'https://mcule.com/api/v1/search/exact/',
            headers=headers,
            json={'queries': batch_smiles},
        )

        if response.status_code != 200:
            # Report the failed batch instead of dropping it silently.
            print("Request failed with status code:", response.status_code)
            print("Response content:", response.text)
            continue

        # Each result pairs an MCule ID with the query SMILES it matched.
        id_smiles_list.extend(
            (result["mcule_id"], result["query"]) for result in response.json()["results"]
        )

    return pd.DataFrame(id_smiles_list, columns=["ID", "Input SMILES"])



def build_packages(instance, df):
    """
    Creates one MCule instant-quote query per amount for the given molecule IDs.

    :param instance: Object whose ``login`` mapping provides ``mcule_api_key``.
    :param df: DataFrame with an ``ID`` column holding MCule IDs.
    :return: List of the ``id`` values of the created quote queries, or
        ``None`` when ``df`` is empty. Requests that do not answer with
        status 201 are printed and left out of the list.
    """
    api_token = instance.login['mcule_api_key']

    # Nothing to quote.
    if df.empty:
        return None

    endpoint = "https://mcule.com/api/v1/iquote-queries/"
    request_headers = {
        "Authorization": "Token " + api_token,
        "Content-Type": "application/json",
        "Accept": "application/json, */*",
        "Accept-Encoding": "gzip, deflate"
    }

    mcule_ids = df["ID"].tolist()
    amounts = ["1", "5", "10", "100", "1000", "10000", "100000", "1000000"]
    created_ids = []

    for amount in tqdm(amounts, total=len(amounts)):
        body = {
            "amount": amount,
            "customer_first_name": "John",
            "customer_last_name": "Doe",
            "delivery_country": "US",
            "mcule_ids": mcule_ids,
            "min_amount": None
        }

        response = requests.post(endpoint, json=body, headers=request_headers)

        # 201 (Created) is the only status treated as success.
        if response.status_code != 201:
            print("POST request failed for amount:", amount)
            print("Response code:", response.status_code)
            print(response.text)
            continue

        created_ids.append(response.json()["id"])

    return created_ids


######################################################################
######################################################################


def get_quotes(token, quote):
    """
    Lazily fetches every quote referenced by a quote-query response.

    :param token: MCule API token used for the ``Authorization`` header.
    :param quote: Mapping that may contain ``group`` -> ``quotes``, a list of
        mappings each holding an ``id``.
    :return: Generator yielding the decoded JSON body of each quote request.
    """
    auth_headers = {
        'Authorization': f'Token {token}',
        'Content-Type': 'application/json',
    }
    quote_entries = quote.get('group', {}).get('quotes', [])

    for entry in quote_entries:
        quote_url = f'https://mcule.com/api/v1/iquotes/{entry["id"]}/'
        yield requests.get(quote_url, headers=auth_headers).json()


def _mcule_wait_for_package(url, headers, poll_interval=0.5):
    """Poll a quote query until it reaches state 30 or 40; return its payload or None."""
    while True:
        package = requests.get(url, headers=headers).json()
        state = package['state']

        # NOTE(review): 40 is treated as "nothing to collect" and 30 as
        # "finished"; the meaning of the codes is not visible here - confirm
        # against the MCule API documentation.
        if state == 40:
            return None
        if state == 30:
            # A finished query without a quote group has no prices.
            return package if package['group'] else None

        # Any other state: wait and ask again.
        time.sleep(poll_interval)


def mcule_collect_prices(instance, package_ids):
    """
    Collects price data for molecules from MCule API.

    :param instance: The PriceCollector instance containing API credentials.
    :param package_ids: list containing molecule package IDs.
    :type instance: PriceCollector
    :type package_ids: list
    :return: DataFrame containing collected price data. The frame is empty
        (but has all columns) when ``package_ids`` is ``None``.
    :rtype: pandas.DataFrame
    """
    columns = ["Source", "ID", "Supplier Name", "SMILES", "Purity", "Price_USD", "Amount", "Measure"]

    token = instance.login['mcule_api_key']

    if package_ids is None:
        return pd.DataFrame([], columns=columns)

    # Define headers for authorization
    headers = {
        'Authorization': f'Token {token}',
        'Content-Type': 'application/json',
    }

    data = []

    for package_id in tqdm(package_ids, total=len(package_ids)):
        url = f'https://mcule.com/api/v1/iquote-queries/{package_id}/'

        response_package = _mcule_wait_for_package(url, headers)
        if response_package is None:
            continue

        for quote in get_quotes(token, response_package):
            # One output row per product item in the quote.
            for item in quote.get('items', []):
                data.append((
                    "MCule",
                    item.get('structure_origin_mcule_id'),
                    item.get('product_supplier_name'),
                    item.get('product_smiles'),
                    item.get('product_purity'),
                    item.get('product_price'),
                    item.get('amount'),
                    "mg",  # every MCule row is recorded with "mg" as its measure
                ))

    return pd.DataFrame(data, columns=columns)

def add_input_smiles_columns(df1, df2):
    """
    Joins two DataFrames on ``ID`` to attach the input SMILES to price rows.

    Both frames are converted to strings and outer-merged on ``ID``. The
    result is sorted by ``Input SMILES``, the ``ID`` column is removed, rows
    without a ``SMILES`` value are dropped and duplicate rows are removed.

    :param df1: DataFrame with ``ID`` and ``Input SMILES`` columns.
    :param df2: DataFrame with ``ID`` and ``SMILES`` columns.
    :return: The merged DataFrame.
    :rtype: pandas.DataFrame
    """
    left = df1.astype(str)
    right = df2.astype(str)

    combined = left.merge(right, how='outer', on=['ID'])
    combined = combined.sort_values(by=['Input SMILES']).drop(columns="ID")

    # Rows that exist only in df1 have no SMILES after the merge.
    return combined.dropna(subset=['SMILES']).drop_duplicates()


######################################################################
"--------------------------Data operation----------------------------"
######################################################################


def merge_dataframes(df_list):
    """
    Merges the price tables of several providers into one DataFrame.

    Every frame is converted to strings and outer-merged on the columns all
    providers share; the result is sorted by ``Input SMILES``. The frames in
    ``df_list`` and the list itself are left untouched.

    :param df_list: List of DataFrames, each containing the common columns.
    :type df_list: list
    :return: The merged DataFrame (only the common columns, and empty, when
        ``df_list`` is empty).
    :rtype: pandas.DataFrame
    """
    # Common columns to use for merging
    common_columns = ['Source', 'Input SMILES', 'SMILES', 'Supplier Name', 'Purity', 'Amount', 'Measure', 'Price_USD']

    # Start from an empty frame so an empty input still yields the common columns.
    merged_df = pd.DataFrame(columns=common_columns)

    for df in df_list:
        # Convert a copy to strings for the merge rather than replacing the
        # caller's list elements.
        merged_df = pd.merge(merged_df, df.astype(str), on=common_columns, how='outer')

    return merged_df.sort_values(by=['Input SMILES'])


######################################################################
######################################################################


# Scale of each supported unit relative to the base unit of its family
# (gram, mole or litre).
_KILO = 1000
_BASE = 1
_MILLI = 1 / 1000
_MICRO = 1 / 1000000

conversion_factors = {
    # mass -> g
    'kg': _KILO, 'g': _BASE, 'mg': _MILLI, 'microg': _MICRO, 'ug': _MICRO,

    # amount of substance -> mol
    'kmol': _KILO, 'mol': _BASE, 'mmol': _MILLI, 'micromol': _MICRO, 'umol': _MICRO,

    # volume -> l ('mL' is listed in addition to 'ml')
    'kl': _KILO, 'l': _BASE, 'ml': _MILLI, 'mL': _MILLI, 'microl': _MICRO, 'ul': _MICRO,
}


######################################################################
######################################################################


def extract_unit_bulk(unit_string):
    """
    Parses a measure string such as ``5x100g`` or ``100mg`` into quantity and unit.

    A multipack of the form ``<count>x<size><unit>`` yields ``count * size``
    as the quantity. Otherwise the first run of digits is the quantity and
    the first run of ASCII letters is the unit.

    :param unit_string: Measure text to parse.
    :type unit_string: str
    :return: ``(bulk, unit)`` with ``bulk`` an int and ``unit`` lower-cased,
        or ``(None, None)`` when either part cannot be found.
    :rtype: tuple
    """
    multipack = re.search(r'(\d+)x(\d+)(\D+)', unit_string)
    if multipack:
        bulk = int(multipack.group(1)) * int(multipack.group(2))
        # \D+ also captures whitespace around the unit, so trim it.
        unit = multipack.group(3).strip().lower()
        return bulk, unit

    number = re.search(r'\d+', unit_string)
    letters = re.search(r'[a-zA-Z]+', unit_string)
    if number is None or letters is None:
        return None, None

    # Always hand back a pair: callers unpack the result into two names.
    return int(number.group()), letters.group().lower()

def standardize_prices(row):
    """
    Converts a row's price into USD per base unit (g, mol or l).

    ``Measure`` is first looked up directly in ``conversion_factors``. If it
    is not listed there it is parsed with ``extract_unit_bulk`` and the
    parsed quantity multiplies the row's amount.

    :param row: Mapping-like row with ``Measure``, ``Amount`` and ``Price_USD``.
    :return: The price per base unit, or ``None`` (after printing a notice)
        when the measure cannot be interpreted.
    """
    unit_label = row['Measure']
    quantity = float(row['Amount'])
    usd = float(row['Price_USD'])

    # Plain unit: scale the amount directly.
    if unit_label in conversion_factors:
        return usd / (conversion_factors[unit_label] * quantity)

    # Composite measure such as "5x100g".
    multiplier, parsed_unit = extract_unit_bulk(unit_label)
    if quantity and parsed_unit and parsed_unit in conversion_factors:
        return usd / (conversion_factors[parsed_unit] * (quantity * multiplier))

    print("Unknown measure units for:", unit_label)
    return None
    
def add_standardized_columns(df):
    """
    Adds the ``USD/g``, ``USD/mol`` and ``USD/l`` columns to a price table.

    Each column is filled through ``standardize_prices`` for rows whose
    ``Measure`` belongs to that unit family and is left empty for the rest.
    The columns are written into ``df`` itself; a non-empty frame is then
    returned as a copy sorted by ``Input SMILES`` and the three new columns.

    :param df: Price table with ``Input SMILES``, ``Measure``, ``Amount``
        and ``Price_USD`` columns.
    :type df: pandas.DataFrame
    :return: ``df`` itself when it is empty, otherwise the sorted copy.
    :rtype: pandas.DataFrame
    """
    if df.empty:
        # Nothing to convert: just make sure the columns exist.
        for column in ('USD/g', 'USD/mol', 'USD/l'):
            df[column] = ''
        return df

    df['Measure'] = df['Measure'].astype(str)

    def price_if(row, plain_units, multipack_pattern=None):
        # Convert the row only when its measure belongs to the unit family.
        label = row['Measure']
        wanted = label in plain_units
        if not wanted and multipack_pattern is not None:
            wanted = re.match(multipack_pattern, label)
        return standardize_prices(row) if wanted else None

    mass_units = ['g', 'mg', 'kg', 'microg', 'ug']
    mole_units = ['mol', 'micromol', 'mmol', 'kmol', 'umol']
    volume_units = ['ml', 'microl', 'l', 'mL', 'kl', 'ul']

    # NOTE(review): the multipack patterns are case-sensitive, so "2x50ml"
    # is not recognised as a volume - confirm whether that is intended.
    df['USD/g'] = df.apply(lambda row: price_if(row, mass_units, r'\d+x\d+g'), axis=1)
    df['USD/mol'] = df.apply(lambda row: price_if(row, mole_units), axis=1)
    df['USD/l'] = df.apply(lambda row: price_if(row, volume_units, r'\d+x\d+mL'), axis=1)

    return df.sort_values(by=['Input SMILES', 'USD/g', 'USD/mol', 'USD/l'])


######################################################################
######################################################################


def filter_csv_by_min_price(df):
    """
    Keeps, per input SMILES, the cheapest offer in each standardized unit.

    Rows without any of ``USD/g``, ``USD/mol`` and ``USD/l`` are discarded.
    For each of the three columns the rows holding the per-SMILES minimum
    are selected and reduced to one row per SMILES; the three selections are
    concatenated and sorted.

    :param df: Price table containing ``Input SMILES`` and the three
        standardized price columns.
    :type df: pandas.DataFrame
    :return: DataFrame with at most one row per SMILES and unit family.
    :rtype: pandas.DataFrame
    """
    price_columns = ["USD/g", "USD/mol", "USD/l"]

    # Remove rows where none of the standardized prices is present
    df = df.dropna(subset=price_columns, how='all')

    cheapest = []
    for column in price_columns:
        # The string alias selects pandas' own NaN-skipping group minimum;
        # passing the builtin ``min`` is deprecated.
        group_min = df.groupby("Input SMILES")[column].transform("min")
        at_minimum = df[group_min == df[column]]

        # Rows are shuffled before one is taken per SMILES, so ties on the
        # minimum price are broken at random.
        cheapest.append(
            at_minimum.sample(frac=1).groupby("Input SMILES", as_index=False).first()
        )

    filtered_df = pd.concat(cheapest)

    return filtered_df.sort_values(by=['Input SMILES', 'USD/g', 'USD/mol', 'USD/l'])


######################################################################
######################################################################


def collect_vendors(instance, smiles_list, progress_output=None, ChemSpace=True, Molport=True, MCule=True):
    """
    Collects prices from the selected vendors and merges them into one table.

    The vendors are queried in the order Molport, ChemSpace, MCule. Status
    messages are printed along the way, and when ``progress_output`` is given
    the cumulative progress (a fraction that reaches 1.0) is appended to it
    after each stage.

    :param instance: Object holding the API credentials in ``login``.
    :param smiles_list: List of SMILES strings to look up.
    :param progress_output: Optional list that receives progress fractions.
    :param ChemSpace: Whether to query ChemSpace.
    :param Molport: Whether to query Molport.
    :param MCule: Whether to query MCule.
    :return: The merged price table, or an empty DataFrame when no vendor
        was selected.
    :rtype: pandas.DataFrame
    """
    started = time.perf_counter()

    enabled_count = sum((ChemSpace, Molport, MCule))
    progress = 0

    # (provider name, price table) pairs in query order
    providers = []

    def advance(fraction):
        # Accumulate progress and publish it when a sink was supplied.
        nonlocal progress
        progress += fraction
        if progress_output is not None:
            progress_output.append(progress)

    if Molport:
        # Stage 1: molecule IDs (three quarters of Molport's share)
        print(f"Collecting ID's for given {len(smiles_list)} SMILES from MolPort...")
        molport_ids = molport_get_ids(instance, smiles_list)
        found = molport_ids['Input SMILES'].nunique()
        print(f"Total: {found} molecules and {len(molport_ids)} conformers are found in MolPort.\n")
        advance(3/(4*enabled_count))

        # Stage 2: prices
        print(f"Collecting Prices for given {len(smiles_list)} IDs from MolPort...")
        molport_prices = molport_collect_prices(instance, molport_ids)
        priced = molport_prices.loc[molport_prices['Price_USD'].notnull(), 'Input SMILES'].nunique()
        print(f"Total: {len(molport_prices)} prices for {priced} molecules are found in MolPort.\n")
        advance(1/(4*enabled_count))
        providers.append(("Molport", molport_prices))

    if ChemSpace:
        print(f"Collecting Prices for given {len(smiles_list)} SMILES from ChemSpace...")
        chemspace_prices = chemspace_collect_prices(instance, smiles_list)
        molecule_count = chemspace_prices['Input SMILES'].nunique()
        price_count = len(chemspace_prices.loc[chemspace_prices['Price_USD'].notnull()])
        print(f"Total: {price_count} prices for {molecule_count} molecules are found in ChemSpace.\n")
        advance(1/enabled_count)
        providers.append(("ChemSpace", chemspace_prices))

    if MCule:
        # Stage 1: molecule IDs and quote packages (half of MCule's share)
        print(f"Collecting ID's for given {len(smiles_list)} SMILES from MCule...")
        mcule_ids = mcule_get_ids(instance, smiles_list)
        found = mcule_ids['Input SMILES'].nunique()
        packages = build_packages(instance, mcule_ids)
        print(f"Total: {found} molecules and {len(mcule_ids)} conformers are found in MCule.\n")
        advance(1/(2*enabled_count))

        # Stage 2: prices, joined back to the input SMILES
        print(f"Collecting Prices for given {len(smiles_list)} IDs from MCule...")
        mcule_prices = mcule_collect_prices(instance, packages)
        mcule_prices = add_input_smiles_columns(mcule_ids, mcule_prices)
        priced = mcule_prices.loc[mcule_prices['Price_USD'].notnull(), 'Input SMILES'].nunique()
        print(f"Total: {len(mcule_prices)} prices for {priced} molecules are found in MCule.\n")
        advance(1/(2*enabled_count))
        providers.append(("MCule", mcule_prices))

    if not providers:
        print("The credentials are missing or incorrect. You need to set credential for at least one integrator.")
        return pd.DataFrame([])

    names = [name for name, _ in providers]
    if len(names) >= 2:
        all_providers = ", ".join(names[:-1]) + " and " + names[-1]
    else:
        all_providers = names[0]

    print(f"Merging Results from {all_providers}...")
    merged_df = merge_dataframes([table for _, table in providers])
    merged_molecules = merged_df['Input SMILES'].nunique()
    merged_prices = len(merged_df.loc[merged_df['Price_USD'].notnull(), 'Input SMILES'])
    print(f"Total: {merged_prices} prices for {merged_molecules} molecules exist in the Merged file.\n")

    finished = time.perf_counter()
    print(f"Total time: {finished - started:0.4f} seconds")
    print("Vendor price collection is successfully done!")

    if progress_output is not None:
        progress_output.append(1.0)  # Ensure completion

    return merged_df