import pandas as pd
import numpy as np


def read_NDBC_file(file_name, missing_values=['MM', 9999, 999, 99]):
"""
Reads a NDBC wave buoy data file (from https://www.ndbc.noaa.gov).
Realtime and historical data files can be loaded with this function.
Note: With realtime data, missing data is denoted by "MM". With historical
data, missing data is denoted using a variable number of
# 9's, depending on the data type (for example: 9999.0 999.0 99.0).
'N/A' is automatically converted to missing data.
Data values are converted to float/int when possible. Column names are
also converted to float/int when possible (this is useful when column
names are frequency).
Parameters
------------
file_name : string
Name of NDBC wave buoy data file
missing_value : list of values
List of values that denote missing data
Returns
---------
data: pandas DataFrame
Data indexed by datetime with columns named according to header row
metadata: dict or None
Dictionary with {column name: units} key value pairs when the NDBC file
contains unit information, otherwise None is returned
"""
    assert isinstance(file_name, str), 'file_name must be of type str'
    assert isinstance(missing_values, list), 'missing_values must be of type list'

    # Open file and get header rows
    with open(file_name, "r") as f:
        header = f.readline().rstrip().split()  # read potential headers
        units = f.readline().rstrip().split()   # read potential units

    # If the first line is commented, remove the comment sign #
    if header[0].startswith("#"):
        header[0] = header[0][1:]
        header_commented = True
    else:
        header_commented = False

    # If the second line is commented, indicate that units exist
    if units[0].startswith("#"):
        units_exist = True
    else:
        units_exist = False
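
    # For reference, the first two lines of a typical NDBC realtime standard
    # meteorological file look roughly like the following (an illustrative
    # sketch, not an exact copy of any specific file):
    #
    #   #YY  MM DD hh mm WDIR WSPD GST  WVHT   DPD   APD MWD   PRES  ATMP ...
    #   #yr  mo dy hr mn degT m/s  m/s     m   sec   sec degT   hPa  degC ...
    #
    # so both the header row and the units row begin with '#', which is what
    # the two checks above detect.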
    # Check if the time stamp contains minutes, and create a list of column
    # names to parse for the date
    if header[4] == 'mm':
        parse_vals = header[0:5]
        date_format = '%Y %m %d %H %M'
        units = units[5:]  # remove date columns from units
    else:
        parse_vals = header[0:4]
        date_format = '%Y %m %d %H'
        units = units[4:]  # remove date columns from units
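
    # As an illustration (hypothetical realtime file): after the leading '#'
    # is stripped, header might begin ['YY', 'MM', 'DD', 'hh', 'mm', ...],
    # giving parse_vals = ['YY', 'MM', 'DD', 'hh', 'mm'] and
    # date_format = '%Y %m %d %H %M'. Older historical files omit the 'mm'
    # column and fall through to the four-column branch above.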
    # If the first line is commented, manually feed in column names
    if header_commented:
        data = pd.read_csv(file_name, sep=r'\s+', header=None, names=header,
                           comment="#", parse_dates=[parse_vals])
    # If the first line is not commented, the first row can be used as header
    else:
        data = pd.read_csv(file_name, sep=r'\s+', header=0,
                           comment="#", parse_dates=[parse_vals])
    # Convert index to datetime
    date_column = "_".join(parse_vals)
    data['Time'] = pd.to_datetime(data[date_column], format=date_format)
    data.index = data['Time'].values

    # Remove date columns
    del data[date_column]
    del data['Time']

    # If there was a row of units, convert it to a dictionary
    if units_exist:
        metadata = {column: unit for column, unit in zip(data.columns, units)}
    else:
        metadata = None
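
    # For a standard meteorological file with a units row, the metadata
    # dictionary built above looks something like (values are illustrative,
    # not taken from a real file):
    #   {'WDIR': 'degT', 'WSPD': 'm/s', 'WVHT': 'm', 'DPD': 'sec', ...}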
    # Convert columns to numeric data if possible, otherwise leave as string
    for column in data:
        try:
            data[column] = pd.to_numeric(data[column])
        except (ValueError, TypeError):
            pass

    # Convert column names to float if possible (handles frequency headers);
    # if any name is non-numeric, leave all names as strings.
    try:
        data.columns = [float(column) for column in data.columns]
    except ValueError:
        pass
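
    # The float conversion above matters for spectral density files, whose
    # data columns are frequencies: headers such as '.0200', '.0325', ...
    # (an illustrative example) become the floats 0.02, 0.0325, ... so they
    # can be used directly as a frequency axis.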
    # Replace indicated missing values with NaN
    data.replace(missing_values, np.nan, inplace=True)

    return data, metadata
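

# Minimal usage sketch. The file name below is hypothetical; point it at a
# realtime or historical NDBC file downloaded from https://www.ndbc.noaa.gov
# before running.
if __name__ == "__main__":
    import os

    example_file = "46042.txt"  # hypothetical local copy of an NDBC data file
    if os.path.isfile(example_file):
        ndbc_data, ndbc_metadata = read_NDBC_file(example_file)
        print(ndbc_data.head())   # DataFrame indexed by datetime
        print(ndbc_metadata)      # {column: unit} dict, or None if no units row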