Serialbox  2.2.0
Data serialization library and tools for C/C++, Python and Fortran
BinaryArchive.cpp
Go to the documentation of this file.
1 //===-- serialbox/core/archive/BinaryArchive.cpp ------------------------------------*- C++ -*-===//
2 //
3 // S E R I A L B O X
4 //
5 // This file is distributed under terms of BSD license.
6 // See LICENSE.txt for more information
7 //
8 //===------------------------------------------------------------------------------------------===//
9 //
13 //===------------------------------------------------------------------------------------------===//
14 
16 #include "serialbox/core/Logging.h"
18 #include "serialbox/core/Version.h"
20 #include <boost/algorithm/string.hpp>
21 #include <fstream>
22 
23 namespace serialbox {
24 
25 //===------------------------------------------------------------------------------------------===//
26 // BinaryBuffer
27 //===------------------------------------------------------------------------------------------===//
28 
30 class BinaryBuffer {
31 public:
33  BinaryBuffer(const StorageView& storageView) {
34  const auto& slice = storageView.getSlice();
35 
36  if(slice.empty()) {
37  buffer_.resize(storageView.sizeInBytes());
38  offset_ = 0;
39  } else {
40  const auto& dims = storageView.dims();
41  const auto& triple = slice.sliceTriples().back();
42  const int bytesPerElement = storageView.bytesPerElement();
43 
44  // Allocate a buffer which can be efficently loaded. The buffer will treat the
45  // dimensions dim_{1}, ..., dim_{N-1} as full while last the dimension dim_{N} as sliced but
46  // without incorporating the step. This is necessary as we only want to call ::write once.
47 
48  // Compute dimensions
49  dims_ = dims;
50  dims_.back() = triple.stop - triple.start;
51 
52  // Compute strides (col-major)
53  strides_.resize(dims_.size());
54 
55  int stride = 1;
56  strides_[0] = stride;
57 
58  for(int i = 1; i < dims_.size(); ++i) {
59  stride *= dims_[i - 1];
60  strides_[i] = stride;
61  }
62 
63  // Compute size
64  std::size_t size = 1;
65  for(std::size_t i = 0; i < dims_.size(); ++i)
66  size *= (dims_[i] == 0 ? 1 : dims_[i]);
67 
68  // Compute initial offset in bytes
69  offset_ = (strides_.back() * triple.start) * bytesPerElement;
70 
71  buffer_.resize(size * bytesPerElement);
72  }
73  }
74 
76  void copyBufferToStorageView(StorageView& storageView) {
77  const auto& slice = storageView.getSlice();
78 
79  if(slice.empty()) {
80  Byte* dataPtr = buffer_.data();
81  const int bytesPerElement = storageView.bytesPerElement();
82 
83  if(storageView.isMemCopyable()) {
84  std::memcpy(storageView.originPtr(), dataPtr, buffer_.size());
85  } else {
86  for(auto it = storageView.begin(), end = storageView.end(); it != end;
87  ++it, dataPtr += bytesPerElement)
88  std::memcpy(it.ptr(), dataPtr, bytesPerElement);
89  }
90 
91  } else {
92  const int numDims = dims_.size();
93  const auto& triples = slice.sliceTriples();
94  const int bytesPerElement = storageView.bytesPerElement();
95  Byte* dataPtr = buffer_.data();
96 
97  // Compute intial indices in the buffer
98  std::vector<int> index(numDims);
99  for(int i = 0; i < numDims - 1; ++i)
100  index[i] = triples[i].start;
101  index.back() = 0;
102 
103  // Iterate over the the storageView and the Buffer
104  Byte* curPtr = buffer_.data();
105  for(auto it = storageView.begin(), end = storageView.end(); it != end; ++it) {
106 
107  // Compute position of current element
108  int pos = 0;
109  for(int i = 0; i < numDims; ++i)
110  pos += bytesPerElement * (strides_[i] * index[i]);
111  curPtr = dataPtr + pos;
112 
113  // Memcopy the current elemment to the storageView
114  std::memcpy(it.ptr(), curPtr, bytesPerElement);
115 
116  // Compute the index of the next element in the buffer
117  for(int i = 0; i < numDims; ++i)
118  if((index[i] += triples[i].step) < triples[i].stop)
119  break;
120  else
121  index[i] = triples[i].start;
122  }
123  }
124  }
125 
127  void copyStorageViewToBuffer(const StorageView& storageView) {
128  Byte* dataPtr = buffer_.data();
129  const int bytesPerElement = storageView.bytesPerElement();
130 
131  if(storageView.isMemCopyable()) {
132  std::memcpy(dataPtr, storageView.originPtr(), buffer_.size());
133  } else {
134  for(auto it = storageView.begin(), end = storageView.end(); it != end;
135  ++it, dataPtr += bytesPerElement)
136  std::memcpy(dataPtr, it.ptr(), bytesPerElement);
137  }
138  }
139 
141  std::size_t size() const noexcept { return buffer_.size(); }
142 
144  Byte* data() noexcept { return buffer_.data(); }
145  const Byte* data() const noexcept { return buffer_.data(); }
146 
148  std::size_t offset() const noexcept { return offset_; }
149 
150 private:
151  std::vector<Byte> buffer_;
152 
153  std::vector<int> strides_;
154  std::vector<int> dims_;
155  std::size_t offset_;
156 };
157 
158 //===------------------------------------------------------------------------------------------===//
159 // BinaryArchive
160 //===------------------------------------------------------------------------------------------===//
161 
// Name of the binary archive. It is stored in the meta-data file as "archive_name" and compared
// against when the meta-data is read back.
162 const std::string BinaryArchive::Name = "Binary";
163 
// Revision of the binary archive. It is stored in the meta-data file as "archive_version" and
// has to match exactly when the meta-data is read back.
164 const int BinaryArchive::Version = 0;
165 
166 BinaryArchive::BinaryArchive(OpenModeKind mode, const std::string& directory,
167  const std::string& prefix, bool skipMetaData)
168  : mode_(mode), directory_(directory), prefix_(prefix), json_() {
169 
170  LOG(info) << "Creating BinaryArchive (mode = " << mode_ << ") from directory " << directory_;
171 
172  metaDatafile_ = directory_ / ("ArchiveMetaData-" + prefix_ + ".json");
174 
175  try {
176  bool isDir = filesystem::is_directory(directory_);
177 
178  switch(mode_) {
179  // We are reading, the directory needs to exist
180  case OpenModeKind::Read:
181  if(!isDir)
182  throw Exception("no such directory: '%s'", directory_.string());
183  break;
184  // We are writing or appending, create directories if it they don't exist
185  case OpenModeKind::Write:
186  case OpenModeKind::Append:
187  if(!isDir)
188  filesystem::create_directories(directory_);
189  break;
190  }
191  } catch(filesystem::filesystem_error& e) {
192  throw Exception(e.what());
193  }
194 
195  if(!skipMetaData)
197 
198  // Remove all files
199  if(mode_ == OpenModeKind::Write)
200  clear();
201 }
202 
204 
206  LOG(info) << "Reading MetaData for BinaryArchive ... ";
207 
208  // Check if metaData file exists
209  if(!filesystem::exists(metaDatafile_)) {
210  if(mode_ != OpenModeKind::Read)
211  return;
212  throw Exception("archive meta data not found in directory '%s'", directory_.string());
213  }
214 
215  std::ifstream fs(metaDatafile_.string(), std::ios::in);
216  fs >> json_;
217  fs.close();
218 
219  int serialboxVersion = json_["serialbox_version"];
220  std::string archiveName = json_["archive_name"];
221  int archiveVersion = json_["archive_version"];
222  std::string hashAlgorithm = json_["hash_algorithm"];
223 
224  // Check consistency
225  if(!Version::isCompatible(serialboxVersion))
226  throw Exception("serialbox version of binary archive (%s) does not match the version "
227  "of the library (%s)",
228  Version::toString(serialboxVersion), SERIALBOX_VERSION_STRING);
229 
230  if(archiveName != BinaryArchive::Name)
231  throw Exception("archive is not a binary archive");
232 
233  if(archiveVersion != BinaryArchive::Version)
234  throw Exception("binary archive version (%s) does not match the version of the library (%s)",
235  archiveVersion, BinaryArchive::Version);
236 
237  // Set the correct hash algorithm if we are not writing
238  if(mode_ != OpenModeKind::Write)
239  hash_ = HashFactory::create(hashAlgorithm);
240 
241  // Deserialize FieldTable
242  for(auto it = json_["fields_table"].begin(); it != json_["fields_table"].end(); ++it) {
243  FieldOffsetTable fieldOffsetTable;
244 
245  // Iterate over savepoint of this field
246  for(auto fileOffsetIt = it->begin(); fileOffsetIt != it->end(); ++fileOffsetIt)
247  fieldOffsetTable.push_back(FileOffsetType{fileOffsetIt->at(0), fileOffsetIt->at(1)});
248 
249  fieldTable_[it.key()] = fieldOffsetTable;
250  }
251 }
252 
254  LOG(info) << "Update MetaData of BinaryArchive";
255 
256  json_.clear();
257 
258  // Tag versions
259  json_["serialbox_version"] =
260  100 * SERIALBOX_VERSION_MAJOR + 10 * SERIALBOX_VERSION_MINOR + SERIALBOX_VERSION_PATCH;
261  json_["archive_name"] = BinaryArchive::Name;
262  json_["archive_version"] = BinaryArchive::Version;
263  json_["hash_algorithm"] = hash_->name();
264 
265  // FieldsTable
266  for(auto it = fieldTable_.begin(), end = fieldTable_.end(); it != end; ++it) {
267  for(unsigned int id = 0; id < it->second.size(); ++id)
268  json_["fields_table"][it->first].push_back({it->second[id].offset, it->second[id].checksum});
269  }
270 
271  // Write metaData to disk (just overwrite the file, we assume that there is never more than one
272  // Archive per data set and thus our in-memory copy is always the up-to-date one)
273  std::ofstream fs(metaDatafile_.string(), std::ios::out | std::ios::trunc);
274 
275  if(!fs.is_open())
276  throw Exception("cannot open file: %s", metaDatafile_);
277 
278  fs << json_.dump(2) << std::endl;
279  fs.close();
280 }
281 
283 
284 //===------------------------------------------------------------------------------------------===//
285 // Writing
286 //===------------------------------------------------------------------------------------------===//
287 
288 FieldID BinaryArchive::write(const StorageView& storageView, const std::string& field,
289  const std::shared_ptr<FieldMetainfoImpl> info) {
290  if(mode_ == OpenModeKind::Read)
291  throw Exception("Archive is not initialized with OpenModeKind set to 'Write' or 'Append'");
292 
293  LOG(info) << "Attempting to write field \"" << field << "\" to BinaryArchive ...";
294 
295  filesystem::path filename(directory_ / (prefix_ + "_" + field + ".dat"));
296  std::ofstream fs;
297 
298  // Create binary data buffer
299  BinaryBuffer binaryBuffer(storageView);
300  binaryBuffer.copyStorageViewToBuffer(storageView);
301 
302  // Compute hash
303  std::string checksum(hash_->hash(binaryBuffer.data(), binaryBuffer.size()));
304 
305  // Check if field already exists
306  auto it = fieldTable_.find(field);
307  FieldID fieldID{field, 0};
308 
309  // Field does exists
310  if(it != fieldTable_.end()) {
311  FieldOffsetTable& fieldOffsetTable = it->second;
312 
313  // Check if field has already been serialized by comparing the checksum
314  for(std::size_t i = 0; i < fieldOffsetTable.size(); ++i)
315  if(checksum == fieldOffsetTable[i].checksum) {
316  LOG(info) << "Field \"" << field << "\" already serialized (id = " << i << "). Stopping";
317  fieldID.id = i;
318  return fieldID;
319  }
320 
321  // Append field at the end
322  fs.open(filename.string(), std::ofstream::out | std::ofstream::binary | std::ofstream::app);
323 #ifdef SERIALBOX_COMPILER_MSVC
324  fs.seekp(0, fs.end);
325 #endif
326  auto offset = fs.tellp();
327  fieldID.id = fieldOffsetTable.size();
328  fieldOffsetTable.push_back(FileOffsetType{offset, checksum});
329 
330  LOG(info) << "Appending field \"" << fieldID.name << "\" (id = " << fieldID.id << ") to "
331  << filename.filename();
332  }
333  // Field does not exist, create new file and append data
334  else {
335  fs.open(filename.string(), std::ios::out | std::ios::binary | std::ios::trunc);
336  fieldID.id = 0;
337 
338  fieldTable_.insert(
339  FieldTable::value_type(fieldID.name, FieldOffsetTable(1, FileOffsetType{0, checksum})));
340 
341  LOG(info) << "Creating new file " << filename.filename() << " for field \"" << fieldID.name
342  << "\" (id = " << fieldID.id << ")";
343  }
344 
345  if(!fs.is_open())
346  throw Exception("cannot open file: '%s'", filename.string());
347 
348  // Write binaryData to disk
349  fs.write(binaryBuffer.data(), binaryBuffer.size());
350  fs.close();
351 
352  updateMetaData();
353 
354  LOG(info) << "Successfully wrote field \"" << fieldID.name << "\" (id = " << fieldID.id << ") to "
355  << filename.filename();
356  return fieldID;
357 }
358 
359 void BinaryArchive::writeToFile(std::string filename, const StorageView& storageView) {
360  // Create binary data buffer
361  BinaryBuffer binaryBuffer(storageView);
362  binaryBuffer.copyStorageViewToBuffer(storageView);
363 
364  // Write data to disk
365  std::ofstream fs(filename, std::ios::out | std::ios::binary | std::ios::trunc);
366 
367  if(!fs.is_open())
368  throw Exception("cannot open file: '%s'", filename);
369 
370  fs.write(binaryBuffer.data(), binaryBuffer.size());
371  fs.close();
372 }
373 
374 //===------------------------------------------------------------------------------------------===//
375 // Reading
376 //===------------------------------------------------------------------------------------------===//
377 
378 void BinaryArchive::read(StorageView& storageView, const FieldID& fieldID,
379  std::shared_ptr<FieldMetainfoImpl> info) const {
380  LOG(info) << "Attempting to read field \"" << fieldID.name << "\" (id = " << fieldID.id
381  << ") via BinaryArchive ... ";
382 
383  // Check if field exists
384  auto it = fieldTable_.find(fieldID.name);
385  if(it == fieldTable_.end())
386  throw Exception("no field '%s' registered in BinaryArchive", fieldID.name);
387 
388  const FieldOffsetTable& fieldOffsetTable = it->second;
389 
390  // Check if id is valid
391  if(fieldID.id >= fieldOffsetTable.size())
392  throw Exception("invalid id '%i' of field '%s'", fieldID.id, fieldID.name);
393 
394  // Create binary data buffer
395  BinaryBuffer binaryBuffer(storageView);
396 
397  // Open file & read into binary buffer
398  std::string filename((directory_ / (prefix_ + "_" + fieldID.name + ".dat")).string());
399  std::ifstream fs(filename, std::ios::binary);
400 
401  if(!fs.is_open())
402  throw Exception("cannot open file: '%s'", filename);
403 
404  // Set position in the stream
405  auto offset = fieldOffsetTable[fieldID.id].offset + binaryBuffer.offset();
406  fs.seekg(offset);
407 
408  // Read data into contiguous memory
409  fs.read(binaryBuffer.data(), binaryBuffer.size());
410  fs.close();
411 
412  binaryBuffer.copyBufferToStorageView(storageView);
413 
414  LOG(info) << "Successfully read field \"" << fieldID.name << "\" (id = " << fieldID.id << ")";
415 }
416 
417 void BinaryArchive::readFromFile(std::string filename, StorageView& storageView) {
418  filesystem::path filepath(filename);
419 
420  if(!filesystem::exists(filepath))
421  throw Exception("cannot open %s: file does not exist", filepath);
422 
423  // Create binary data buffer
424  BinaryBuffer binaryBuffer(storageView);
425 
426  std::ifstream fs(filepath.string(), std::ios::in | std::ios::binary);
427 
428  if(!fs.is_open())
429  throw Exception("cannot open file: '%s'", filename);
430 
431  // Read data into contiguous memory
432  fs.read(binaryBuffer.data(), binaryBuffer.size());
433  fs.close();
434 
435  binaryBuffer.copyBufferToStorageView(storageView);
436 }
437 
438 std::ostream& BinaryArchive::toStream(std::ostream& stream) const {
439  stream << "BinaryArchive = {\n";
440  stream << " directory: " << directory_.string() << "\n";
441  stream << " mode: " << mode_ << "\n";
442  stream << " prefix: " << prefix_ << "\n";
443  stream << " fieldsTable = {\n";
444  for(auto it = fieldTable_.begin(), end = fieldTable_.end(); it != end; ++it) {
445  stream << " " << it->first << " = {\n";
446  for(std::size_t id = 0; id < it->second.size(); ++id)
447  stream << " [ " << it->second[id].offset << ", " << it->second[id].checksum << " ]\n";
448  stream << " }\n";
449  }
450  stream << " }\n";
451  stream << "}\n";
452  return stream;
453 }
454 
456  filesystem::directory_iterator end;
457  for(filesystem::directory_iterator it(directory_); it != end; ++it) {
458  if(filesystem::is_regular_file(it->path()) &&
459  boost::algorithm::starts_with(it->path().filename().string(), prefix_ + "_") &&
460  filesystem::path(it->path()).extension() == ".dat") {
461 
462  if(!filesystem::remove(it->path()))
463  LOG(warning) << "BinaryArchive: cannot remove file " << it->path();
464  }
465  }
466  clearFieldTable();
467 }
468 
470  fieldTable_.clear();
471  json_.clear();
472 }
473 
474 std::unique_ptr<Archive> BinaryArchive::create(OpenModeKind mode, const std::string& directory,
475  const std::string& prefix) {
476  return std::make_unique<BinaryArchive>(mode, directory, prefix, false);
477 }
478 
479 } // namespace serialbox
BinaryBuffer(const StorageView &storageView)
Allocate the buffer.
Byte * originPtr() noexcept
Get raw data pointer.
Definition: StorageView.h:86
static std::string defaultHash()
Get the default hash algorithm (currently MD5 if available, SHA256 otherwise)
Definition: HashFactory.cpp:46
virtual void clear() override
Clear the archive i.e remove all data from disk and reset the internal data-structures.
static void writeToFile(std::string filename, const StorageView &storageView)
Directly write field (given by storageView) to file.
static std::string toString(int version)
Convert to string.
Definition: Version.h:38
virtual std::ostream & toStream(std::ostream &stream) const override
Convert the archive to stream.
static const std::string Name
Name of the binary archive.
Definition: BinaryArchive.h:35
std::size_t sizeInBytes() const noexcept
Size of the allocated, sliced data (without padding) in Bytes.
unsigned int id
ID within the field.
Definition: FieldID.h:29
static void readFromFile(std::string filename, StorageView &storageView)
Directly read field (given by storageView) from file.
static const int Version
Revision of the binary archive.
Definition: BinaryArchive.h:38
void copyStorageViewToBuffer(const StorageView &storageView)
Copy data from storageView to buffer.
virtual std::string directory() const override
Directory to write/read files.
Definition: BinaryArchive.h:98
std::vector< FileOffsetType > FieldOffsetTable
Table of ids and corresponding offsets within each field (i.e. file)
Definition: BinaryArchive.h:47
#define LOG(severity)
Logging infrastructure.
Definition: Logging.h:102
Represent a mutable view to a multi-dimensional storage.
Definition: StorageView.h:33
static std::unique_ptr< Hash > create(const std::string &name)
Construct an instance of the Hash name
Definition: HashFactory.cpp:26
Namespace of the serialbox library.
Definition: Archive.h:25
char Byte
Represent a byte i.e sizeof(Byte) == 1.
Definition: Type.h:35
std::size_t offset() const noexcept
Get initial offset of the data on disk in bytes.
virtual void read(StorageView &storageView, const FieldID &fieldID, std::shared_ptr< FieldMetainfoImpl > info) const override
Read the field identified by fieldID and given by storageView from disk.
Uniquely identify a field.
Definition: FieldID.h:27
std::string name
Name of the field.
Definition: FieldID.h:28
const std::vector< int > & dims() const noexcept
Get dimensions.
Definition: StorageView.h:96
virtual std::string prefix() const override
Prefix of all files.
virtual void updateMetaData() override
Update the meta-data on disk.
virtual OpenModeKind mode() const override
Open-policy of the archive.
Definition: BinaryArchive.h:96
void readMetaDataFromJson()
Load meta-data from JSON file.
void copyBufferToStorageView(StorageView &storageView)
Copy data from buffer to storageView while handling slicing.
void writeMetaDataToJson()
Convert meta-data to JSON and serialize to file.
void clearFieldTable()
Clear fieldTable.
std::vector< SliceTriple > & sliceTriples() noexcept
Get slice triples.
Definition: Slice.h:106
Slice & getSlice() noexcept
Get the slice of the StorageView
Definition: StorageView.h:129
bool isMemCopyable() const noexcept
Return true if the storage is contiguous in memory (i.e no padding) and is column-major ordered...
Definition: StorageView.cpp:91
StorageViewIterator end() noexcept
Iterator to the end of the data.
Definition: StorageView.h:64
OpenModeKind
Policy for opening files in the Serializer and Archive.
Definition: Type.h:40
bool empty() const noexcept
Check if slice is empty.
Definition: Slice.h:100
StorageViewIterator begin() noexcept
Iterator to the beginning of the data.
Definition: StorageView.h:56
static bool isCompatible(int version) noexcept
Check if the given version is compatible with the current library version (i.e. is older) ...
Definition: Version.h:61
serialbox::Slice slice
Specification of the slice indices which is used for partial loading of serialized data...
Definition: Slice.h:43
static std::unique_ptr< Archive > create(OpenModeKind mode, const std::string &directory, const std::string &prefix)
Create a BinaryArchive.
Contiguous buffer with support for sliced loading.
Byte * data() noexcept
Get pointer to the beginning of the buffer.
virtual FieldID write(const StorageView &storageView, const std::string &fieldID, const std::shared_ptr< FieldMetainfoImpl > info) override
Write the field given by storageView to disk.
Exception class which stores a human-readable error description.
Definition: Exception.h:30
std::size_t size() const noexcept
Get Buffer size.
int bytesPerElement() const noexcept
Get bytes per element.
Definition: StorageView.h:93
virtual ~BinaryArchive()
Destructor.