infravis-trahust/py/import.py

import csv, sqlite3, os, glob
from itertools import compress


BASEDIR = '../Data/'


for hus_name in next(os.walk(BASEDIR))[1]:
	print('Importing House: ' + hus_name)

	db_file = hus_name + '.db'
	try:
		print(f"Deleting and recreating '{db_file}'...")
		os.remove(db_file)
	except:
		#print('Some error occurred; Aborting DB import. Sorry! :(')
		pass

	con = sqlite3.connect(db_file)
	cur = con.cursor()

	cards = []
	all_txts = sorted(glob.glob(f"{BASEDIR+hus_name}/**/*.TXT", recursive=True))
	for txt in all_txts:

		card, year, month = os.path.split(txt)[1].split('.')[0].split('-')

		if card not in cards:
			cards.append(card)

			# Create a new table for the card
			table_name = f'Hus_{hus_name}_card_{card}'
			print(f'New card: {card}; table name: {table_name}')

			hus_dir = BASEDIR + hus_name
			with open(f'{hus_dir}/{card}-metadata.csv', encoding='utf-8') as card_file:
				csv_reader = csv.reader(card_file)
				next(csv_reader)			# first line is not important

				# Family names
				family_names = next(csv_reader)
				cur_family = family_names[3]
				# Fill the gaps
				for i in range(4,len(family_names)):
					if not family_names[i]:
						family_names[i] = cur_family
					else:
						cur_family = family_names[i]
				#print(family_names)

				# Sensor names
				sensor_names = next(csv_reader)
				#print(sensor_names)

				# Units
				units = next(csv_reader)
				for i, u in enumerate(units):
					units[i] = units[i].replace('°C', 'celsius')
					units[i] = units[i].replace('%', 'percent')
					units[i] = units[i].replace('Ω', 'ohm')
					units[i] = units[i].replace('#', 'count')
					units[i] = units[i].replace('/', 'X')
					units[i] = units[i].replace('???', 'none')
				#print(units)

				# Remove all empty columns
				not_empty = [x != 'EMPTY' for x in sensor_names]
				family_names = list(compress(family_names, not_empty))
				sensor_names = list(compress(sensor_names, not_empty))
				units = list(compress(units, not_empty))

				# Create the table
				col_names = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
				col_names.extend(f'{family_names[i]}.{sensor_names[i]}.{units[i]}' for i in range(3, len(family_names)))
				#print(col_names)

				cur.execute(f"CREATE TABLE {table_name}{*col_names,}")
				con.commit()

				cur.execute(f'CREATE INDEX idx_{card}_Year ON {table_name} (Year)')
				cur.execute(f'CREATE INDEX idx_{card}_Month ON {table_name} (Month)')
				cur.execute(f'CREATE INDEX idx_{card}_Day ON {table_name} (Day)')
				cur.execute(f'CREATE INDEX idx_{card}_Hour ON {table_name} (Hour)')
				con.commit()

		#exit()

		print(f'Processing {txt}...')
		with open(txt, 'r', encoding='latin-1') as fin:  # Specify the encoding here
			reader = csv.reader(fin)
			data = []
			for line in reader:
				#print(line)
				if line[0][:9] == 'Connected':
					# Skip two lines; we don't need them anymore
					header1 = next(reader)
					header2 = next(reader)
				else:
					# First split the date into three columns (y,m,d)
					row = line[0].split('-')
					# Then split the time into three columns (h,m,s)
					row.extend(line[1].split(':'))
					# Then add the data (without the date, time, and empty columns)
					row.extend(tuple(compress(line, not_empty))[2:])

					data.append(row)

					# Sometimes, some of the lines in a file are the wrong size.
					# When that happens, we skip the file.
					if len(data[-1]) != len(col_names):
						break

			# Sometimes, some of the lines in a file are the wrong size.
			# When that happens, we skip the file.
			if len(data[-1]) != len(col_names):
				print("   - Invalid file format; SKIPPING!")
				print(f"   - Expected cols: {len(col_names)}; Found cols: {len(data[-1])}")
				continue

			placeholder = f"({','.join('?' for _ in range(len(data[0])))})"

			query = f"INSERT INTO {table_name} VALUES {placeholder};"
			#print(query)
			#exit()

			cur.executemany(query, data)
			con.commit()