From 69f0699ebd8db2232a705f61a5655f99fba96edc Mon Sep 17 00:00:00 2001 From: pca006132 Date: Thu, 27 Aug 2020 13:06:09 +0800 Subject: [PATCH] test: improved test_performance 1. Added tests for small payload. 2. Added statistics. --- artiq/test/coredevice/test_performance.py | 310 +++++++++++++++------- 1 file changed, 220 insertions(+), 90 deletions(-) diff --git a/artiq/test/coredevice/test_performance.py b/artiq/test/coredevice/test_performance.py index c9ab98b01..3ca1f86ba 100644 --- a/artiq/test/coredevice/test_performance.py +++ b/artiq/test/coredevice/test_performance.py @@ -6,138 +6,268 @@ import numpy from artiq.experiment import * from artiq.test.hardware_testbench import ExperimentCase +# large: 1MB payload +# small: 1KB payload +bytes_large = b"\x00" * (1 << 20) +bytes_small = b"\x00" * (1 << 10) + +list_large = [123] * (1 << 18) +list_small = [123] * (1 << 8) + +array_large = numpy.array(list_large, numpy.int32) +array_small = numpy.array(list_small, numpy.int32) + +byte_list_large = [True] * (1 << 20) +byte_list_small = [True] * (1 << 10) + +received_bytes = 0 +time_start = 0 +time_end = 0 class _Transfer(EnvExperiment): def build(self): self.setattr_device("core") - self.data = b"\x00"*(10**6) + self.count = 10 + self.h2d = [0.0] * self.count + self.d2h = [0.0] * self.count @rpc - def source(self) -> TBytes: - return self.data + def get_bytes(self, large: TBool) -> TBytes: + if large: + return bytes_large + else: + return bytes_small @rpc - def source_byte_list(self) -> TList(TBool): - return [True] * (1 << 15) + def get_list(self, large: TBool) -> TList(TInt32): + if large: + return list_large + else: + return list_small @rpc - def source_list(self) -> TList(TInt32): - return [123] * (1 << 15) + def get_byte_list(self, large: TBool) -> TList(TBool): + if large: + return byte_list_large + else: + return byte_list_small @rpc - def source_array(self) -> TArray(TInt32): - return numpy.array([0] * (1 << 15), numpy.int32) + def get_array(self, large: TBool) -> TArray(TInt32): + if large: + return array_large + else: + return array_small + + @rpc + def get_string_list(self) -> TList(TStr): + return string_list @rpc def sink(self, data): pass - @rpc - def sink_list(self, data): - pass + @rpc(flags={"async"}) + def sink_async(self, data): + global received_bytes, time_start, time_end + if received_bytes == 0: + time_start = time.time() + received_bytes += len(data) + if received_bytes == (1024 ** 2)*128: + time_end = time.time() @rpc - def sink_array(self, data): - pass + def get_async_throughput(self) -> TFloat: + return 128.0 / (time_end - time_start) @kernel - def host_to_device(self): - t0 = self.core.get_rtio_counter_mu() - data = self.source() - t1 = self.core.get_rtio_counter_mu() - return len(data)/self.core.mu_to_seconds(t1-t0) + def test_bytes(self, large): + def inner(): + t0 = self.core.get_rtio_counter_mu() + data = self.get_bytes(large) + t1 = self.core.get_rtio_counter_mu() + self.sink(data) + t2 = self.core.get_rtio_counter_mu() + self.h2d[i] = self.core.mu_to_seconds(t1 - t0) + self.d2h[i] = self.core.mu_to_seconds(t2 - t1) + + for i in range(self.count): + inner() + return (self.h2d, self.d2h) @kernel - def host_to_device_list(self): - t0 = self.core.get_rtio_counter_mu() - data = self.source_list() - t1 = self.core.get_rtio_counter_mu() - return 4 * len(data)/self.core.mu_to_seconds(t1-t0) + def test_byte_list(self, large): + def inner(): + t0 = self.core.get_rtio_counter_mu() + data = self.get_byte_list(large) + t1 = self.core.get_rtio_counter_mu() + self.sink(data) + t2 = self.core.get_rtio_counter_mu() + self.h2d[i] = self.core.mu_to_seconds(t1 - t0) + self.d2h[i] = self.core.mu_to_seconds(t2 - t1) + + for i in range(self.count): + inner() + return (self.h2d, self.d2h) @kernel - def host_to_device_array(self): - t0 = self.core.get_rtio_counter_mu() - data = self.source_array() - t1 = self.core.get_rtio_counter_mu() - return 4 * len(data)/self.core.mu_to_seconds(t1-t0) + def test_list(self, large): + def inner(): + t0 = self.core.get_rtio_counter_mu() + data = self.get_list(large) + t1 = self.core.get_rtio_counter_mu() + self.sink(data) + t2 = self.core.get_rtio_counter_mu() + self.h2d[i] = self.core.mu_to_seconds(t1 - t0) + self.d2h[i] = self.core.mu_to_seconds(t2 - t1) + + for i in range(self.count): + inner() + return (self.h2d, self.d2h) @kernel - def host_to_device_byte_list(self): - t0 = self.core.get_rtio_counter_mu() - data = self.source_byte_list() - t1 = self.core.get_rtio_counter_mu() - return len(data)/self.core.mu_to_seconds(t1-t0) + def test_array(self, large): + def inner(): + t0 = self.core.get_rtio_counter_mu() + data = self.get_array(large) + t1 = self.core.get_rtio_counter_mu() + self.sink(data) + t2 = self.core.get_rtio_counter_mu() + self.h2d[i] = self.core.mu_to_seconds(t1 - t0) + self.d2h[i] = self.core.mu_to_seconds(t2 - t1) + + for i in range(self.count): + inner() + return (self.h2d, self.d2h) @kernel - def device_to_host(self): - t0 = self.core.get_rtio_counter_mu() - self.sink(self.data) - t1 = self.core.get_rtio_counter_mu() - return len(self.data)/self.core.mu_to_seconds(t1-t0) - - @kernel - def device_to_host_list(self): - #data = [[0]*8 for _ in range(1 << 12)] - data = [0]*(1 << 15) - t0 = self.core.get_rtio_counter_mu() - self.sink_list(data) - t1 = self.core.get_rtio_counter_mu() - return ((len(data)*4) / - self.core.mu_to_seconds(t1-t0)) - - @kernel - def device_to_host_array(self): - data = self.source_array() - t0 = self.core.get_rtio_counter_mu() - self.sink_array(data) - t1 = self.core.get_rtio_counter_mu() - return ((len(data)*4) / - self.core.mu_to_seconds(t1-t0)) - + def test_async(self): + data = self.get_bytes(True) + for _ in range(128): + self.sink_async(data) + return self.get_async_throughput() class TransferTest(ExperimentCase): - def test_host_to_device(self): - exp = self.create(_Transfer) - host_to_device_rate = exp.host_to_device() - print(host_to_device_rate/(1024*1024), "MiB/s") - self.assertGreater(host_to_device_rate, 2.0e6) + @classmethod + def setUpClass(self): + self.results = [] - def test_host_to_device_byte_list(self): - exp = self.create(_Transfer) - host_to_device_rate = exp.host_to_device_byte_list() - print(host_to_device_rate/(1024*1024), "MiB/s") - self.assertGreater(host_to_device_rate, 2.0e6) + @classmethod + def tearDownClass(self): + if len(self.results) == 0: + return + max_length = max(max(len(row[0]) for row in self.results), len("Test")) - def test_host_to_device_list(self): - exp = self.create(_Transfer) - host_to_device_rate = exp.host_to_device_list() - print(host_to_device_rate/(1024*1024), "MiB/s") - self.assertGreater(host_to_device_rate, 2.0e6) + def pad(name): + nonlocal max_length + return name + " " * (max_length - len(name)) + print() + print("| {} | Mean (MiB/s) | std (MiB/s) |".format(pad("Test"))) + print("| {} | ------------ | ------------ |".format("-" * max_length)) + for v in self.results: + print("| {} | {:>12.2f} | {:>12.2f} |".format( + pad(v[0]), v[1], v[2])) - def test_host_to_device_array(self): + def test_bytes_large(self): exp = self.create(_Transfer) - host_to_device_rate = exp.host_to_device_array() - print(host_to_device_rate/(1024*1024), "MiB/s") - self.assertGreater(host_to_device_rate, 2.0e6) + results = exp.test_bytes(True) + host_to_device = (1 << 20) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 20) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["Bytes (1MB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["Bytes (1MB) D2H", device_to_host.mean(), + device_to_host.std()]) - def test_device_to_host(self): + def test_bytes_small(self): exp = self.create(_Transfer) - device_to_host_rate = exp.device_to_host() - print(device_to_host_rate/(1024*1024), "MiB/s") - self.assertGreater(device_to_host_rate, 2.2e6) + results = exp.test_bytes(False) + host_to_device = (1 << 10) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 10) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["Bytes (1KB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["Bytes (1KB) D2H", device_to_host.mean(), + device_to_host.std()]) - def test_device_to_host_list(self): + def test_byte_list_large(self): exp = self.create(_Transfer) - rate = exp.device_to_host_list() - print(rate/(1024*1024), "MiB/s") - self.assertGreater(rate, .15e6) + results = exp.test_byte_list(True) + host_to_device = (1 << 20) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 20) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["Bytes List (1MB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["Bytes List (1MB) D2H", device_to_host.mean(), + device_to_host.std()]) - def test_device_to_host_array(self): + def test_byte_list_small(self): exp = self.create(_Transfer) - rate = exp.device_to_host_array() - print(rate/(1024*1024), "MiB/s") - self.assertGreater(rate, .15e6) + results = exp.test_byte_list(False) + host_to_device = (1 << 10) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 10) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["Bytes List (1KB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["Bytes List (1KB) D2H", device_to_host.mean(), + device_to_host.std()]) + def test_list_large(self): + exp = self.create(_Transfer) + results = exp.test_list(True) + host_to_device = (1 << 20) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 20) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["I32 List (1MB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["I32 List (1MB) D2H", device_to_host.mean(), + device_to_host.std()]) + + def test_list_small(self): + exp = self.create(_Transfer) + results = exp.test_list(False) + host_to_device = (1 << 10) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 10) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["I32 List (1KB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["I32 List (1KB) D2H", device_to_host.mean(), + device_to_host.std()]) + + def test_array_large(self): + exp = self.create(_Transfer) + results = exp.test_array(True) + host_to_device = (1 << 20) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 20) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["I32 Array (1MB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["I32 Array (1MB) D2H", device_to_host.mean(), + device_to_host.std()]) + + def test_array_small(self): + exp = self.create(_Transfer) + results = exp.test_array(False) + host_to_device = (1 << 10) / numpy.array(results[0], numpy.float64) + device_to_host = (1 << 10) / numpy.array(results[1], numpy.float64) + host_to_device /= 1024*1024 + device_to_host /= 1024*1024 + self.results.append(["I32 Array (1KB) H2D", host_to_device.mean(), + host_to_device.std()]) + self.results.append(["I32 Array (1KB) D2H", device_to_host.mean(), + device_to_host.std()]) + + def test_async_throughput(self): + exp = self.create(_Transfer) + results = exp.test_async() + print("Async throughput: {:>6.2f}MiB/s".format(results)) class _KernelOverhead(EnvExperiment): def build(self):