Refreshing Dataset cache automatically after your data loads

For your data stored on HDFS, you will typically go through the process of batch loading new data either daily or sometimes more frequently. The Arcadia catalog should be refreshed soon after the data is available so that it can be made queryable from the Arcadia UI. In addition to refreshing the table metadata in the Arcadia catalog, it's also possible to refresh the result cache for any Datasets associated with a table that has had its data refreshed or updated.

Below is a sample Python script that accepts a database-qualified table name and refreshes the cache of every Dataset associated with that table.

import sys, getopt
import requests


# Connection settings for the Arcadia web UI.
# NOTE(review): credentials are hardcoded placeholders here for illustration;
# in production read them from the environment or a secrets store instead.
arcadia_url = "http://<hostname>:38888"
username = "<myusername>"
password = "<mypassword>"


class ManageSession(object):
	"""Wraps a ``requests`` session used to authenticate against the Arcadia UI.

	``set()`` primes the session with the login page's CSRF cookie and builds
	the login POST payload; the session object itself is exposed as
	``self.session`` for subsequent API calls.
	"""

	def __init__(self):
		# Session persists cookies (incl. the CSRF token) across requests.
		self.session = requests.Session()
		# Login form payload; populated by set().
		self.payload = None

	def set(self, url, username, password):
		"""Fetch the login page, capture the CSRF token, and build the payload.

		Returns ``self`` so the call can be chained.
		"""
		response = self.session.get(url + "/arc/apps/login")
		# Django's CSRF protection validates the standard "Referer" header;
		# the original code set a header named "referrer", which the CSRF
		# middleware never reads.
		self.session.headers['Referer'] = response.url
		self.payload = {'username': username,
						'password': password,
						# Token must be echoed back from the cookie the
						# login page just set.
						'csrfmiddlewaretoken': self.session.cookies['arccsrftoken']}
		return self

def get_dataset_ids(session, base_table):
	"""Return the ids of all Datasets built on ``base_table``.

	``session`` is an authenticated ManageSession; ``base_table`` is the
	fully-qualified table name (e.g. ``"spot.event"``).  Exits the process
	with status 2 if the Datasets API call fails.
	"""
	response = session.session.get(arcadia_url + "/arc/adminapi/v1/datasets?detail=1")
	if response.status_code == 200:
		all_datasets = response.json()
		print("[+] INFO : Looking for Datasets matching base table - %s" % str(base_table))
		# A Dataset may reference several tables; match on any of them.
		dataset_ids = [dataset["id"]
					   for dataset in all_datasets
					   for table in dataset["info"]
					   if table["tablename"] == base_table]
		print("[+] INFO - Code=%s : Dataset fetch successful. Fetched %s datasets. Dataset ids: %s" % (str(response.status_code),str(len(dataset_ids)), str(dataset_ids)))
		return dataset_ids
	else:
		print("[!] ERROR - Code=%s : Datasets fetch failed." % str(response.status_code))
		sys.exit(2)

def refresh_dataset_cache(session, dataset_id):
	"""POST a cache reset for one Dataset id.

	Requires ``session.payload`` to carry a fresh CSRF token (see
	ManageSession.set).  Exits the process with status 2 on failure.
	"""
	response = session.session.post(arcadia_url + "/arc/datasets/dataset/cache_reset/%s" % str(dataset_id), data=session.payload)
	if response.status_code == 200:
		print("[+] INFO - Code=%s : Cache refreshed for Dataset %s" % (str(response.status_code),str(dataset_id)))
	else:
		print("[!] ERROR - Code=%s : Cache refresh fail for Dataset %s" % (str(response.status_code),str(dataset_id)))
		sys.exit(2)

def get_cli_args(argv):
	"""Parse the command line and return the ``<database>.<table>`` argument.

	Exits with status 2 on an option-parsing error, or — previously an
	unhandled IndexError — when the positional table name is missing.
	"""
	try:
		opts, args = getopt.getopt(argv, "-t")
		# args.pop(0) on an empty list raised IndexError, which the
		# GetoptError handler below never caught; fail with a usage
		# message instead.
		if not args:
			print("[!] ERROR : Missing required argument: <database>.<table>")
			sys.exit(2)
		return args[0]
	except getopt.GetoptError as e:
		print("[!] ERROR : %s" % str(e))
		sys.exit(2)


if __name__ == '__main__':
	## Get base table name from cli arg
	tablename = get_cli_args(sys.argv[1:])
	## Create User Session
	session = ManageSession()
	## Set user session cookie and authenticate
	session = session.set(arcadia_url, username, password)
	session.session.post(arcadia_url + "/arc/apps/login", data=session.payload)
	## Check for Datasets related to a table
	dataset_ids = get_dataset_ids(session, tablename)
	## Refresh the CSRF token/cookie once more before the cache-reset POSTs
	session = session.set(arcadia_url, username, password)
	## Reset Cache for each relevant Dataset for the base table of interest
	for dataset_id in dataset_ids:  # renamed from ``id`` to avoid shadowing the builtin
		refresh_dataset_cache(session, dataset_id)

Below is an example of how to execute this script and the expected successful response:

$ python3 refresh_cache.py spot.event

[+] INFO : Looking for Datasets matching base table - spot.event
[+] INFO - Code=200 : Dataset fetch successful. Fetched 7 datasets. Dataset ids: [264, 266, 14, 63, 380, 386, 1166]
[+] INFO - Code=200 : Cache refreshed for Dataset 264
[+] INFO - Code=200 : Cache refreshed for Dataset 266
[+] INFO - Code=200 : Cache refreshed for Dataset 14
[+] INFO - Code=200 : Cache refreshed for Dataset 63
[+] INFO - Code=200 : Cache refreshed for Dataset 380
[+] INFO - Code=200 : Cache refreshed for Dataset 386
[+] INFO - Code=200 : Cache refreshed for Dataset 1166
1 Like