mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
* Starting to spawn base package * More package hacking * Bedrock and VertexAI * Parquet split * Updated templates * Utils
45 lines
746 B
Python
Executable file
45 lines
746 B
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
"""
|
|
Concatenates multiple parquet files into a single parquet output
|
|
"""
|
|
|
|
import pyarrow as pa
|
|
import pyarrow.parquet as pq
|
|
import pandas as pd
|
|
import sys
|
|
import argparse
|
|
|
|
def _build_parser():
    """Create the command-line parser for the combine-parquet tool.

    Returns:
        An ``argparse.ArgumentParser`` that accepts ``-i/--input`` (zero or
        more input paths) and ``-o/--output`` (the destination path).
    """
    parser = argparse.ArgumentParser(
        prog="combine-parquet",
        description=__doc__,
    )
    parser.add_argument(
        '-i', '--input',
        nargs='*',
        # Without a default, leaving -i off yields None and the later
        # iteration raises TypeError; treat it the same as "-i" with no
        # files, which has always been a silent no-op.
        default=[],
        help='Input files',
    )
    parser.add_argument(
        '-o', '--output',
        # Reject a missing destination at parse time instead of failing
        # inside the parquet writer after every input has been read.
        required=True,
        help='Output file',
    )
    return parser


def _combine(inputs, output):
    """Append the rows of every parquet file in *inputs* and write *output*.

    Args:
        inputs: Paths of the parquet files to read, in the order their rows
            should appear in the result.
        output: Path the combined parquet file is written to.

    Returns:
        True if an output file was written, False if *inputs* was empty (in
        which case nothing is written).
    """
    frames = [pq.read_table(path).to_pandas() for path in inputs]

    if not frames:
        return False

    if len(frames) == 1:
        # A single input is passed through untouched so its index is kept;
        # only multi-file results get a fresh 0..n-1 index.
        combined = frames[0]
    else:
        # Concatenate once: growing a frame file-by-file re-copies all the
        # previously accumulated rows on every iteration.
        combined = pd.concat(frames, ignore_index=True)

    pq.write_table(pa.Table.from_pandas(combined), output)
    return True


def main(argv=None):
    """Parse the command line and combine the requested parquet files.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) uses the process arguments.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    args = _build_parser().parse_args(argv)
    _combine(args.input, args.output)


if __name__ == "__main__":
    main()