Benchmarking the querier's verbs

Today, give a try to Techtonique web app, a tool designed to help you make informed, data-driven decisions using Mathematics, Statistics, Machine Learning, and Data Visualization. Here is a tutorial with audio, video, code, and slides: https://moudiki2.gumroad.com/l/nrhgb. 100 API requests are now (and forever) offered to every user every month, no matter the pricing tier.

The querier is a query language for Python pandas Data Frames, inspired by relational databases querying. There are also new ways of using pandas Data Frames for optimizing performance, such as Dask or modin. I’m considering an integration of the querierwith them, and the first step in this direction, was for me to understand the querier’s perfomance itself.

Average timings in seconds:

image-title-here

The full code for these benchmarks is:

From Terminal, install the package:

pip install git+https://github.com/Techtonique/querier.git

In Python:

import pandas as pd
import numpy as np
from time import time
from  tqdm import tqdm

import querier as qr

# Ex1: summarize

# n: 100 to 1000000
# p: 10 to 100
n_range = [int(x) for x in np.linspace(start=100, stop=1e6, num=10)]
p_range = [int(x) for x in np.linspace(start=10, stop=1e2, num=10)]
res = np.zeros((len(n_range), len(p_range)))

for idx, n in tqdm(enumerate(n_range)):
    
    print(f" idx = {idx} -----")
    print("\n")
    
    for idy, p in tqdm(enumerate(p_range)):
    
        np.random.seed(123)
        df1 = pd.DataFrame(np.random.randint(0, 10, size=(n, p)), 
                      columns=['v' + str(i) for i in range(p)])
        col_group1 = 'v' + str(p)
        col_group2 = 'v' + str(p-1)
        df1[col_group1] = np.random.choice(a=("choice1", "choice2"), size=n)
        df1[col_group2] = np.random.choice(a=("choice3", "choice4"), size=n)
        
        start = time()
        
        [qr.summarize(df1, req = "avg(v1), avg(v2),"+col_group1+","+col_group2, 
              group_by = col_group1+","+col_group2) for _ in range(10)]                
        
        res[idx, idy] = time() - start
        
        qr.summarize.cache.clear()

np.min(res)
np.max(res)
np.savetxt("summarize.csv", res/10, delimiter=",")


# Ex2: filter

res2 = np.zeros((len(n_range), len(p_range)))

for idx, n in tqdm(enumerate(n_range)):
    
    print(f" idx = {idx} -----")
    print("\n")
    
    for idy, p in tqdm(enumerate(p_range)):
    
        np.random.seed(123)
        df1 = pd.DataFrame(np.random.randint(0, 10, size=(n, p)), 
                      columns=['v' + str(i) for i in range(p)])
        col_group1 = 'v' + str(p)
        col_group2 = 'v' + str(p-1)
        df1[col_group1] = np.random.choice(a=("choice1", "choice2"), size=n)
        df1[col_group2] = np.random.choice(a=("choice3", "choice4"), size=n)
        
        start = time()
        
        [qr.filtr(df1, req = "(" + col_group1 + "== 'choice1')" + " & " + "(" + col_group2 + "== 'choice4')") for _ in range(10)]                
        
        res2[idx, idy] = time() - start
        
        qr.filtr.cache.clear()


np.min(res2)
np.max(res2)
np.savetxt("filtr.csv", res2/10, delimiter=",")


# Ex3: select

res3 = np.zeros((len(n_range), len(p_range)))

for idx, n in tqdm(enumerate(n_range)):
    
    print(f" idx = {idx} -----")
    print("\n")
    
    for idy, p in tqdm(enumerate(p_range)):
    
        np.random.seed(123)
        df1 = pd.DataFrame(np.random.randint(0, 10, size=(n, p)), 
                      columns=['v' + str(i) for i in range(p)])
        col_group1 = 'v' + str(p)
        col_group2 = 'v' + str(p-1)
        df1[col_group1] = np.random.choice(a=("choice1", "choice2"), size=n)
        df1[col_group2] = np.random.choice(a=("choice3", "choice4"), size=n)
        
        start = time()
        
        [qr.select(df1, req = col_group1 + ", " + col_group2) for _ in range(10)]                
        
        res3[idx, idy] = time() - start
        
        qr.select.cache.clear()


np.min(res3)
np.max(res3)
np.savetxt("select.csv", res3/10, delimiter=",")


# Ex4: join

n_range = [int(x) for x in np.linspace(start=100, stop=1e4, num=10)]
p_range = [int(x) for x in np.linspace(start=10, stop=1e2, num=10)]

res4 = np.zeros((len(n_range), len(p_range)))

for idx, n in tqdm(enumerate(n_range)):
    
    print(f" idx = {idx} -----")
    print("\n")
    
    for idy, p in tqdm(enumerate(p_range)):
    
        np.random.seed(123)
        df1 = pd.DataFrame(np.random.randint(0, 10, size=(n, p)), 
                      columns=['v' + str(i) for i in range(p)])
        col_group1 = 'v' + str(p)
        col_group2 = 'v' + str(p-1)
        df1[col_group1] = np.random.choice(a=("choice1", "choice2"), size=n)
        df1[col_group2] = np.random.choice(a=("choice3", "choice4"), size=n)
        
        np.random.seed(234)
        df2 = pd.DataFrame(np.random.randint(0, 10, size=(n, p)), 
                      columns=['v' + str(i) for i in range(p)])
        col_group1 = 'v' + str(p)
        col_group2 = 'v' + str(p-1)
        df2[col_group1] = np.random.choice(a=("choice1", "choice2"), size=n)
        df2[col_group2] = np.random.choice(a=("choice3", "choice4"), size=n)
        
        start = time()
        
        [qr.join(df1, df2, on=col_group1 + ", " + col_group2) for _ in range(10)]                
        
        res4[idx, idy] = time() - start
        
        qr.join.cache.clear()


np.min(res4)
np.max(res4)
np.savetxt("join.csv", res4/10, delimiter=",")

You may have noticed the instruction qr.join.cache.clear() in this code. It’s there because each querier verb has a cache, a dictionary (Python dict) that you can manipulate accordingly. Only the first function call might be time-consuming (or not!), but subsequent calls will be much, much faster.

Note: I am currently looking for a gig. You can hire me on Malt or send me an email: thierry dot moudiki at pm dot me. I can do descriptive statistics, data preparation, feature engineering, model calibration, training and validation, and model outputs’ interpretation. I am fluent in Python, R, SQL, Microsoft Excel, Visual Basic (among others) and French. My résumé? Here!

Comments powered by Talkyard.

@misc{ tmoudiki20191129, author = { T. Moudiki }, title = { Benchmarking the querier's verbs }, url = { https://thierrymoudiki.github.io/blog/2019/11/29/databases/the-querier-3 }, year = { 2019 } }