Skip to content

Previewing precursor and merged Parquets

Use DuckDB to peek at the merged pixel extraction table alongside its precursor Parquet files. Set `fl_dir` to the directory of the flightline you want to inspect.

import duckdb, glob, os

fl_dir = "/path/to/NEON_D13_NIWO_DP1_L020-1_20230815_directional_reflectance"

# The merged table is named after the flightline directory, so derive the file
# name from fl_dir: changing fl_dir is then the only edit a reader has to make.
# normpath drops a trailing separator that would otherwise empty the basename.
flightline = os.path.basename(os.path.normpath(fl_dir))
merged = os.path.join(fl_dir, f"{flightline}_merged_pixel_extraction.parquet")

# Glob patterns for the precursor files, in the order they are searched.
# "*_envi.parquet" also matches every file the narrower patterns match.
PRECURSOR_PATTERNS = (
    "*_envi.parquet",
    "*_brdfandtopo_corrected_envi.parquet",
    "*_landsat_*_envi.parquet",
    "*_micasense*_envi.parquet",
)


def find_precursors(directory, patterns=PRECURSOR_PATTERNS):
    """Return the precursor Parquet paths in *directory*, without repeats.

    Each pattern is globbed in turn and its matches are sorted. Because the
    patterns overlap, a path is kept only the first time it is seen, so every
    file appears exactly once in first-match order.

    Args:
        directory: Directory to search (not recursive).
        patterns: Glob patterns, relative to *directory*, tried in order.

    Returns:
        A list of matching paths; empty if nothing matches or the directory
        does not exist.
    """
    # Escape the directory so glob metacharacters in it are taken literally.
    base = glob.escape(directory)
    matches = (
        path
        for pattern in patterns
        for path in sorted(glob.glob(os.path.join(base, pattern)))
    )
    # dict.fromkeys keeps insertion order while dropping repeated paths.
    return list(dict.fromkeys(matches))


precursors = find_precursors(fl_dir)

con = duckdb.connect()

# File paths are bound as query parameters instead of being interpolated into
# the SQL text, so a path containing a quote character cannot break the query.
preview_sql = "SELECT * FROM read_parquet(?) LIMIT 5"

# merged head: displayed explicitly, because a bare expression is only rendered
# when it is the last statement of a notebook cell
merged_head = con.execute(preview_sql, [merged]).df()
display(merged_head)

# peek at the first rows of every precursor file that was found
for p in precursors:
    print("→", os.path.basename(p))
    # LIMIT 5 already caps the preview, so no extra .head() is needed
    df = con.execute(preview_sql, [p]).df()
    display(df)

# quick schema check: verify spectral names and order
# Fail with a clear message instead of an IndexError on precursors[0] when the
# glob patterns matched nothing.
if not precursors:
    raise FileNotFoundError(f"No precursor Parquet files found in {fl_dir!r}")

# parquet_schema() exposes each field's name in a column called "name", so it
# is aliased to column_name for the filter and the result.
schema_check = con.execute(
    """
    WITH cols AS (
      SELECT name AS column_name
      FROM parquet_schema($path)
    )
    SELECT *
    FROM cols
    WHERE column_name ~ '.*_b[0-9]{3}_wl[0-9]{4}nm$'
    LIMIT 10
    """,
    {'path': precursors[0]},
).df()
schema_check