Spaces:

SyamNaren
/

medicalGpt

Runtime error

App Files Files Community

medicalGpt / myenv /Lib /site-packages /fsspec /implementations /tests /test_dbfs.py

SyamNaren

upload env

63deadc verified over 1 year ago

raw

history blame contribute delete

9.23 kB

	"""
	Test-Cases for the DataBricks Filesystem.
	This test case is somewhat special, as there is no "mock" databricks
	API available. We use the [vcr](https://github.com/kevin1024/vcrpy)
	package to record the requests and responses to the real databricks API and
	replay them on tests.

	This means, however, that when you change the tests (or when the API
	itself changes, which is very unlikely to occur as it is versioned),
	you need to re-record the answers. This can be done as follows:

	1. Delete all cassette files in the "./cassettes/test_dbfs" folder
	2. Spin up a databricks cluster. For example,
	you can use an Azure Databricks instance for this.
	3. Take note of the instance details: the instance URL (for example, for an
	Azure Databricks cluster, this has the form
	adb-<some-number>.<two digits>.azuredatabricks.net)
	and your personal token (find out more here:
	https://docs.databricks.com/dev-tools/api/latest/authentication.html)
	4. Set the two environment variables `DBFS_INSTANCE` and `DBFS_TOKEN`
	5. Now execute the tests as normal. The results of the API calls will be recorded.
	6. Unset the environment variables and replay the tests.
	"""

	import os
	import sys
	from urllib.parse import urlparse

	import numpy
	import pytest

	import fsspec

	# Skip the whole module on Python 3.10 and newer: per the skip message, the
	# recorded API interactions have to be re-recorded first.
	if sys.version_info[:2] >= (3, 10):
	    pytest.skip("These tests need to be re-recorded.", allow_module_level=True)

	# Placeholder host that replaces the real instance URL in recorded requests.
	DUMMY_INSTANCE = "my_instance.com"
	# Connection details are read from the environment; without them the dummy
	# host and an empty token are used.
	INSTANCE = os.environ.get("DBFS_INSTANCE", DUMMY_INSTANCE)
	TOKEN = os.environ.get("DBFS_TOKEN", "")


	@pytest.fixture(scope="module")
	def vcr_config():
	    """
	    Return the VCR configuration for the tests in this module.

	    To not record information in the instance and token details
	    (which are sensitive), we delete them from both the
	    request and the response before storing it.
	    We also delete the date as it is likely to change
	    (and will make git diffs harder).
	    If the DBFS_TOKEN env variable is set, we record with VCR.
	    If not, we only replay (to not accidentally record with a wrong URL).
	    """

	    def before_record_response(response):
	        # Strip each header on its own: a missing org-id header must not
	        # stop the date header from being removed (and vice versa).
	        for header in ("x-databricks-org-id", "date"):
	            try:
	                del response["headers"][header]
	            except KeyError:
	                pass
	        return response

	    def before_record_request(request):
	        # Replace the instance URL with the dummy host before storing it.
	        parsed = urlparse(request.uri)
	        request.uri = parsed._replace(netloc=DUMMY_INSTANCE).geturl()
	        return request

	    if not TOKEN:
	        # No token: replay only, never record.
	        return {"record_mode": "none"}

	    return {
	        "record_mode": "once",
	        "filter_headers": [("authorization", "DUMMY")],
	        "before_record_response": before_record_response,
	        "before_record_request": before_record_request,
	    }


	@pytest.fixture
	def dbfsFS():
	    """Return a "dbfs" filesystem created from the module's INSTANCE and TOKEN."""
	    return fsspec.filesystem("dbfs", instance=INSTANCE, token=TOKEN)


	@pytest.fixture
	def make_mock_diabetes_ds():
	    """
	    Return a pyarrow table with 25 rows of random, diabetes-like data.

	    Tests using this fixture are skipped when pyarrow cannot be imported.
	    Integer columns are drawn with ``numpy.random.randint`` (whose ``high``
	    bound is exclusive) and float columns with ``numpy.random.uniform``.
	    """
	    pa = pytest.importorskip("pyarrow")

	    n_rows = 25

	    def random_ints(low, high):
	        # `high` is exclusive, as in numpy.random.randint.
	        return pa.array(numpy.random.randint(low=low, high=high, size=n_rows))

	    def random_floats(low, high):
	        return pa.array(numpy.random.uniform(low, high, size=n_rows))

	    # Column order is significant: it defines the schema of the table.
	    columns = {
	        "Pregnancies": random_ints(0, 17),
	        "Glucose": random_ints(0, 199),
	        "BloodPressure": random_ints(0, 122),
	        "SkinThickness": random_ints(0, 99),
	        "Insulin": random_ints(0, 846),
	        "BMI": random_floats(0.0, 67.1),
	        "DiabetesPedigreeFunction": random_floats(0.08, 2.42),
	        "Age": random_ints(21, 81),
	        # Binary label: the exclusive upper bound must be 2 so that both
	        # 0 and 1 can occur (high=1 would always yield 0).
	        "Outcome": random_ints(0, 2),
	    }

	    return pa.Table.from_arrays(
	        arrays=list(columns.values()),
	        names=list(columns),
	    )


	@pytest.mark.vcr()
	def test_dbfs_file_listing(dbfsFS):
	    """Check that "/FileStore" shows up in both the plain and detailed root listing."""
	    root_names = dbfsFS.ls("/", detail=False)
	    assert "/FileStore" in root_names

	    expected_entry = {"name": "/FileStore", "size": 0, "type": "directory"}
	    root_entries = dbfsFS.ls("/", detail=True)
	    assert expected_entry in root_entries


	@pytest.mark.vcr()
	def test_dbfs_mkdir(dbfsFS):
	    """Create a nested directory, check the error cases, then remove it again."""
	    parent = "/FileStore/my"
	    nested = "/FileStore/my/dir"

	    def filestore_names():
	        return dbfsFS.ls("/FileStore/", detail=False)

	    # Start from a clean state.
	    dbfsFS.rm(parent, recursive=True)
	    assert parent not in filestore_names()

	    dbfsFS.mkdir(nested, create_parents=True)

	    assert parent in filestore_names()
	    assert nested in dbfsFS.ls("/FileStore/my/", detail=False)

	    # Creating the same directory again must fail when exist_ok is False.
	    with pytest.raises(FileExistsError):
	        dbfsFS.mkdir(nested, create_parents=True, exist_ok=False)

	    # A non-recursive removal of the non-empty parent must fail ...
	    with pytest.raises(OSError):
	        dbfsFS.rm(parent, recursive=False)

	    # ... and leave the directory in place.
	    assert parent in filestore_names()

	    dbfsFS.rm(parent, recursive=True)
	    assert parent not in filestore_names()


	@pytest.mark.vcr()
	def test_dbfs_write_and_read(dbfsFS):
	    """Write a file, read it back unchanged, and remove it again."""
	    target = "/FileStore/file.csv"

	    # Start from a clean state.
	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)

	    payload = b"This is a test\n" * 100000 + b"For this is the end\n"

	    with dbfsFS.open(target, "wb") as writer:
	        writer.write(payload)

	    assert target in dbfsFS.ls("/FileStore", detail=False)

	    with dbfsFS.open(target, "rb") as reader:
	        read_back = reader.read()
	        assert read_back == payload

	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)


	@pytest.mark.vcr()
	def test_dbfs_read_range(dbfsFS):
	    """Read a byte range of a small file via ``cat_file`` with start and end."""
	    target = "/FileStore/file.txt"
	    first, last = 8, 14

	    # Start from a clean state.
	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)

	    payload = b"This is a test\n"
	    with dbfsFS.open(target, "wb") as writer:
	        writer.write(payload)
	    assert target in dbfsFS.ls("/FileStore", detail=False)

	    fetched = dbfsFS.cat_file(target, start=first, end=last)
	    assert fetched == payload[first:last]

	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)


	@pytest.mark.vcr()
	def test_dbfs_read_range_chunked(dbfsFS):
	    """Read a large file from an offset to its end via ``cat_file``."""
	    target = "/FileStore/large_file.txt"
	    offset = 8

	    # Start from a clean state.
	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)

	    repeats = 2**18
	    payload = b"This is a test\n" * repeats + b"For this is the end\n"
	    with dbfsFS.open(target, "wb") as writer:
	        writer.write(payload)
	    assert target in dbfsFS.ls("/FileStore", detail=False)

	    fetched = dbfsFS.cat_file(target, start=offset)
	    assert fetched == payload[offset:]

	    dbfsFS.rm(target)
	    assert target not in dbfsFS.ls("/FileStore/", detail=False)


	@pytest.mark.vcr()
	def test_dbfs_write_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
	    """Write the mock table as a parquet dataset and check the resulting listing."""
	    pytest.importorskip("pyarrow.dataset")
	    pq = pytest.importorskip("pyarrow.parquet")

	    base_dir = "/FileStore/pyarrow"
	    dataset_dir = "/FileStore/pyarrow/diabetes"

	    def dataset_listing():
	        return dbfsFS.ls(dataset_dir, detail=False)

	    # Start from a clean state.
	    dbfsFS.rm(base_dir, recursive=True)
	    assert base_dir not in dbfsFS.ls("/FileStore/", detail=False)

	    pq.write_to_dataset(
	        make_mock_diabetes_ds,
	        root_path=dataset_dir,
	        filesystem=dbfsFS,
	        compression="none",
	        existing_data_behavior="error",
	        use_threads=False,
	    )

	    # Exactly one entry is expected, located under the dataset directory
	    # and carrying a ".parquet" name. The listing is fetched anew for each
	    # check, exactly as often as before.
	    assert len(dataset_listing()) == 1
	    assert dataset_dir in dataset_listing()[0]
	    assert ".parquet" in dataset_listing()[0]

	    dbfsFS.rm(base_dir, recursive=True)
	    assert base_dir not in dbfsFS.ls("/FileStore/", detail=False)


	@pytest.mark.vcr()
	def test_dbfs_read_pyarrow_non_partitioned(dbfsFS, make_mock_diabetes_ds):
	    """Write the mock table as a parquet dataset and read it back as a table."""
	    ds = pytest.importorskip("pyarrow.dataset")
	    pq = pytest.importorskip("pyarrow.parquet")

	    base_dir = "/FileStore/pyarrow"
	    dataset_dir = "/FileStore/pyarrow/diabetes"
	    expected = make_mock_diabetes_ds

	    def dataset_listing():
	        return dbfsFS.ls(dataset_dir, detail=False)

	    # Start from a clean state.
	    dbfsFS.rm(base_dir, recursive=True)
	    assert base_dir not in dbfsFS.ls("/FileStore/", detail=False)

	    pq.write_to_dataset(
	        expected,
	        root_path=dataset_dir,
	        filesystem=dbfsFS,
	        compression="none",
	        existing_data_behavior="error",
	        use_threads=False,
	    )

	    # Exactly one entry is expected, located under the dataset directory
	    # and carrying a ".parquet" name. The listing is fetched anew for each
	    # check, exactly as often as before.
	    assert len(dataset_listing()) == 1
	    assert dataset_dir in dataset_listing()[0]
	    assert ".parquet" in dataset_listing()[0]

	    dataset = ds.dataset(source=dataset_dir, filesystem=dbfsFS)
	    loaded = dataset.to_table()

	    assert loaded.num_rows == expected.num_rows
	    assert loaded.num_columns == expected.num_columns
	    # Every field that was read back must also exist in the written schema.
	    unexpected_fields = set(loaded.schema) - set(expected.schema)
	    assert unexpected_fields == set()

	    dbfsFS.rm(base_dir, recursive=True)
	    assert base_dir not in dbfsFS.ls("/FileStore/", detail=False)