#!/usr/bin/env python3

import datetime
import io

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

n_columns = 10
n_records = 1_000_000
# n_records = 1000
data = np.random.randint(0, 1000, [n_records, n_columns])
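# One way to make repeated runs comparable is to seed the generator, e.g.:
#   rng = np.random.default_rng(42)
#   data = rng.integers(0, 1000, (n_records, n_columns))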

# CSV: serialize with np.savetxt into an in-memory text buffer.
csv_output = io.StringIO()
before = datetime.datetime.now()
np.savetxt(csv_output, data, '%d', ',')
csv_generate_elapsed_time = datetime.datetime.now() - before
csv = csv_output.getvalue()
print(f'Generate CSV: {csv_generate_elapsed_time.total_seconds()}')

# CSV: parse the text back into an integer array.
csv_input = io.StringIO(csv)
before = datetime.datetime.now()
np.loadtxt(csv_input, int, delimiter=',')
csv_load_elapsed_time = datetime.datetime.now() - before
print(f'Load CSV: {csv_load_elapsed_time.total_seconds()}')
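# np.loadtxt parses in pure Python; a compiled CSV parser such as pandas'
# C engine would typically read the same text faster, e.g.:
#   pd.read_csv(io.StringIO(csv), header=None).to_numpy()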

# NumPy: np.save writes the raw array bytes plus a small header (.npy format).
npy_output = io.BytesIO()
before = datetime.datetime.now()
np.save(npy_output, data)
npy_generate_elapsed_time = datetime.datetime.now() - before
npy = npy_output.getvalue()
print(f'Generate NumPy: {npy_generate_elapsed_time.total_seconds()}')

# NumPy: load the .npy bytes back into an array.
npy_input = io.BytesIO(npy)
before = datetime.datetime.now()
np.load(npy_input)
npy_load_elapsed_time = datetime.datetime.now() - before
print(f'Load NumPy: {npy_load_elapsed_time.total_seconds()}')
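# When the .npy data lives in a real file rather than a BytesIO buffer,
# np.load can memory-map it instead of copying ('data.npy' is a placeholder):
#   np.load('data.npy', mmap_mode='r')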

# Apache Parquet: convert to an Arrow table, then write it with ParquetWriter.
before = datetime.datetime.now()
df = pd.DataFrame(data)
table = pa.Table.from_pandas(df)
parquet_output = pa.BufferOutputStream()
writer = pq.ParquetWriter(parquet_output, table.schema)
writer.write_table(table)
writer.close()
parquet = parquet_output.getvalue()
parquet_generate_elapsed_time = datetime.datetime.now() - before
print(f'Generate Apache Parquet: {parquet_generate_elapsed_time.total_seconds()}')

# Apache Parquet: read the buffer back and convert to a NumPy array.
before = datetime.datetime.now()
parquet_input = pa.BufferReader(parquet)
reader = pq.ParquetFile(parquet_input)
table = reader.read()
df = table.to_pandas()
df.to_numpy()
parquet_load_elapsed_time = datetime.datetime.now() - before
print(f'Load Apache Parquet: {parquet_load_elapsed_time.total_seconds()}')
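# pyarrow also offers a one-call convenience reader equivalent to the
# ParquetFile + read() pair above:
#   table = pq.read_table(pa.BufferReader(parquet))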

# Apache Arrow: write the table in the Arrow IPC file format.
before = datetime.datetime.now()
df = pd.DataFrame(data)
table = pa.Table.from_pandas(df)
arrow_output = pa.BufferOutputStream()
writer = pa.ipc.new_file(arrow_output, table.schema)
writer.write_table(table)
writer.close()
arrow = arrow_output.getvalue()
arrow_generate_elapsed_time = datetime.datetime.now() - before
print(f'Generate Apache Arrow: {arrow_generate_elapsed_time.total_seconds()}')

# Apache Arrow: read the IPC file back and convert to a NumPy array.
before = datetime.datetime.now()
arrow_input = pa.BufferReader(arrow)
reader = pa.ipc.open_file(arrow_input)
df = reader.read_pandas()
df.to_numpy()
arrow_load_elapsed_time = datetime.datetime.now() - before
print(f'Load Apache Arrow: {arrow_load_elapsed_time.total_seconds()}')
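# Arrow IPC files on disk can additionally be memory-mapped, which makes
# loading close to zero-copy; a sketch with a placeholder path:
#   with pa.memory_map('data.arrow') as source:
#       table = pa.ipc.open_file(source).read_all()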

labels = ['Generate', 'Load']
csv_elapsed_times = [
    csv_generate_elapsed_time.total_seconds(),
    csv_load_elapsed_time.total_seconds(),
]
npy_elapsed_times = [
    npy_generate_elapsed_time.total_seconds(),
    npy_load_elapsed_time.total_seconds(),
]
parquet_elapsed_times = [
    parquet_generate_elapsed_time.total_seconds(),
    parquet_load_elapsed_time.total_seconds(),
]
arrow_elapsed_times = [
    arrow_generate_elapsed_time.total_seconds(),
    arrow_load_elapsed_time.total_seconds(),
]

y = np.arange(len(labels))
width = 0.35

competitors = [
    ['csv', 'CSV', csv_elapsed_times],
    ['numpy', 'NumPy', npy_elapsed_times],
    ['apache-parquet', 'Apache Parquet', parquet_elapsed_times],
]
# Plot each competitor against Apache Arrow as a grouped horizontal bar chart.
for format_id, label, elapsed_times in competitors:
    fig, ax = plt.subplots()
    ax.barh(y - width / 2, elapsed_times, width, label=label)
    ax.barh(y + width / 2, arrow_elapsed_times, width, label='Apache Arrow')

    ax.set_xlabel(f'Elapsed time (seconds) ({n_records} records) (Shorter is faster)')
    ax.set_title('Apache Arrow improves data interchange performance')
    ax.set_yticks(y)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()
    ax.legend()

    fig.tight_layout()

    # Note: the images/ directory must already exist; fig.savefig() does not create it.
    fig.savefig(f'images/benchmark-data-interchange-apache-arrow-{format_id}.svg')
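# Typical invocation (assuming the script is saved as benchmark.py):
#   mkdir -p images && python3 benchmark.py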