Use parameterized SQL queries to insert new files.

Refactor so that no extra copies are made of the data being inserted.
2025-08-07 13:29:49 +00:00 · 2015-12-29 17:32:22 +00:00 · 2015-12-29 17:32:22 +00:00 · 390b69e150
commit 390b69e150
parent a91e75829f
7 changed files with 360 additions and 147 deletions
--- a/src/pq/connection.cpp
+++ b/src/pq/connection.cpp
@ -24,8 +24,82 @@
 #include <memory>
 #include <boost/lexical_cast.hpp>
 #include <sstream>
+#include <cstring>
+#include "libpqtypes.h"
+#include <cstdlib>
+#include <ctime>
+#include <cassert>
+
+using sc = std::chrono::system_clock;

 namespace pq {
+	namespace implem {
+		//Map each supported C++ argument type to the libpqtypes format
+		//specifier that is appended to the type string handed to PQputvf.
+		//NOTE(review): the unsigned types reuse the signed PostgreSQL type
+		//of the same width, so values above the signed maximum will
+		//presumably not round-trip - confirm callers stay within range.
+		template <> const char* type_to_pqtypes_name<std::string>() { return "%text"; }
+		template <> const char* type_to_pqtypes_name<boost::string_ref>() { return "%text"; }
+		template <> const char* type_to_pqtypes_name<bool>() { return "%bool"; }
+		template <> const char* type_to_pqtypes_name<float>() { return "%float4"; }
+		template <> const char* type_to_pqtypes_name<double>() { return "%float8"; }
+		template <> const char* type_to_pqtypes_name<int16_t>() { return "%int2"; }
+		template <> const char* type_to_pqtypes_name<int32_t>() { return "%int4"; }
+		template <> const char* type_to_pqtypes_name<int64_t>() { return "%int8"; }
+		template <> const char* type_to_pqtypes_name<uint16_t>() { return "%int2"; }
+		template <> const char* type_to_pqtypes_name<uint32_t>() { return "%int4"; }
+		template <> const char* type_to_pqtypes_name<uint64_t>() { return "%int8"; }
+		template <> const char* type_to_pqtypes_name<sc::time_point>() { return "%timestamptz"; }
+		//No explicit instantiations follow: an explicit instantiation that
+		//comes after an explicit specialisation has no effect, and the
+		//specialisations above are already ordinary function definitions.
+
+		//Convert a system_clock time point into a libpqtypes PGtimestamp
+		//(broken-down local time plus time zone info) stored inside this
+		//object, and return a pointer to that storage. The pointer is only
+		//valid for as long as this object is alive.
+		//Throws DatabaseException if the time point cannot be broken down
+		//into a local calendar time.
+		auto get_pqlib_c_type_struct<std::chrono::system_clock::time_point>::conv (const std::chrono::system_clock::time_point& parParam) -> type {
+			static_assert(sizeof(storage) == sizeof(PGtimestamp), "Wrong size for timestamp, please update DATA_SIZE");
+			static_assert(alignof(storage) == alignof(PGtimestamp), "Wrong alignment for timestamp, please update type");
+
+			using std::chrono::system_clock;
+
+			PGtimestamp ts;
+
+			std::memset(&ts, 0, sizeof(PGtimestamp));
+
+			auto t = system_clock::to_time_t(parParam);
+			ts.epoch = t;
+			//localtime() returns a null pointer when the value cannot be
+			//represented; fail loudly instead of dereferencing it.
+			const std::tm* const local = std::localtime(&t);
+			if (not local) {
+				throw DatabaseException("Error converting timestamp", "localtime() failed for the given time point", __FILE__, __LINE__);
+			}
+			ts.time.hour = local->tm_hour;
+			ts.time.min = local->tm_min;
+			ts.time.sec = local->tm_sec;
+			ts.time.usec = 0;
+			ts.time.withtz = 1;
+			ts.date.isbc = 0;
+			ts.date.year = local->tm_year + 1900;
+			ts.date.mon = local->tm_mon;
+			ts.date.mday = local->tm_mday;
+			char* tzn = nullptr;
+			PQlocalTZInfo(&t, &ts.time.gmtoff, &ts.time.isdst, &tzn);
+			if (tzn) {
+				//ts was zero-filled above, so copying at most size-1 bytes
+				//keeps the abbreviation NUL-terminated and inside the buffer.
+				std::strncpy(ts.time.tzabbr, tzn, sizeof(ts.time.tzabbr) - 1);
+			}
+
+			//The static_asserts above guarantee the storage can hold a PGtimestamp.
+			std::memcpy(&m_storage, &ts, sizeof(ts));
+			return &m_storage;
+		}
+
+		//Nothing to release: the converted timestamp lives in m_storage,
+		//which is a plain member of this object.
+		get_pqlib_c_type_struct<std::chrono::system_clock::time_point>::~get_pqlib_c_type_struct ( void ) noexcept {
+		}
+	} //namespace implem
+
+	namespace {
+		//Forward an already started va_list to PQputvf, using the contents
+		//of parTypes as the format string. Returns whatever PQputvf returns.
+		int call_PQputf (PGparam* parParam, const std::string* parTypes, va_list parArgp) {
+			const char* const format = parTypes->c_str();
+			return PQputvf(parParam, nullptr, 0, format, parArgp);
+		}
+	} //unnamed namespace
+
 	struct Connection::LocalData {
 		PGconn* connection;
 	};
@ -81,10 +155,13 @@ namespace pq {
 			throw DatabaseException(oss.str(), std::move(err), __FILE__, __LINE__);
 		}
 		query_void("SET NAMES 'utf8'");
+
+		PQinitTypes(m_localData->connection); //Init libpqtypes
 	}

 	void Connection::disconnect() {
 		if (is_connected()) {
+			PQclearTypes(m_localData->connection); //clear libpqtypes
 			PQfinish(m_localData->connection);
 			m_localData->connection = nullptr;
 		}
@ -134,4 +211,41 @@ namespace pq {
 		PQArrayType clean_str(PQescapeLiteral(m_localData->connection, parString.data(), parString.size()), &PQfreemem);
 		return std::string(clean_str.get());
 	}
+
+	//Execute parQuery with the parameters in parParams and discard the
+	//result. Throws DatabaseException if no result object is returned or
+	//if its status is neither PGRES_TUPLES_OK nor PGRES_COMMAND_OK.
+	void Connection::query_void_params (const std::string& parQuery, PGParams& parParams) {
+		using ResultPtr = std::unique_ptr<PGresult, void(*)(PGresult*)>;
+
+		assert(parParams.get());
+		const int result_format = 1;
+		ResultPtr res(
+			PQparamExec(m_localData->connection, parParams.get(), parQuery.c_str(), result_format),
+			&PQclear
+		);
+		if (not res) {
+			std::ostringstream oss;
+			oss << "Error allocating result object while running \"" << parQuery << "\": " << PQgeterror();
+			throw DatabaseException("Error running query", oss.str(), __FILE__, __LINE__);
+		}
+
+		switch (PQresultStatus(res.get())) {
+		case PGRES_TUPLES_OK:
+		case PGRES_COMMAND_OK:
+			return;
+		default:
+			throw DatabaseException("Error running query", error_message(), __FILE__, __LINE__);
+		}
+	}
+
+	//Build a libpqtypes parameter list from a space-separated list of
+	//format specifiers (parTypes) and the matching C varargs values.
+	//Throws DatabaseException if the list cannot be created or if
+	//libpqtypes rejects the specifiers/values.
+	auto Connection::make_params (const std::string* parTypes, ...) -> PGParams {
+		assert(parTypes);
+		PGParams retval(PQparamCreate(m_localData->connection), &PQparamClear);
+		if (not retval) {
+			std::ostringstream oss;
+			oss << "Error creating parameter list: " << PQgeterror();
+			throw DatabaseException("Error preparing query parameters", oss.str(), __FILE__, __LINE__);
+		}
+
+		//An empty type list means there is nothing to add.
+		if (parTypes->empty()) {
+			return retval;
+		}
+
+		va_list argp;
+		va_start(argp, parTypes);
+		const int put_ok = call_PQputf(retval.get(), parTypes, argp);
+		//Always close the va_list before reacting to the outcome.
+		va_end(argp);
+
+		if (not put_ok) {
+			std::ostringstream oss;
+			oss << "Error adding parameters of types \"" << *parTypes << "\": " << PQgeterror();
+			throw DatabaseException("Error preparing query parameters", oss.str(), __FILE__, __LINE__);
+		}
+
+		//Plain return so the local can be moved/elided by the compiler.
+		return retval;
+	}
 } //namespace pq
--- a/src/pq/connection.hpp
+++ b/src/pq/connection.hpp
@ -23,6 +23,11 @@
 #include <cstdint>
 #include <memory>
 #include <boost/utility/string_ref.hpp>
+#include <chrono>
+#include <type_traits>
+
+struct pg_param;
+typedef pg_param PGparam;

 namespace pq {
 	class Connection {
@ -40,8 +45,15 @@ namespace pq {
 		std::string escaped_literal ( const std::string& parString );
 		std::string escaped_literal ( boost::string_ref parString );

+		template <typename... Args>
+		void query_void ( const std::string& parQuery, Args&&... parArgs );
+
 	private:
 		struct LocalData;
+		using PGParams = std::unique_ptr<::PGparam, void(*)(::PGparam*)>;
+
+		void query_void_params ( const std::string& parQuery, PGParams& parParams );
+		PGParams make_params ( const std::string* parTypes, ... );

 		const std::string m_username;
 		const std::string m_passwd;
@ -51,6 +63,71 @@ namespace pq {
 		std::unique_ptr<LocalData> m_localData;
 	};

+	namespace implem {
+		//Maps a C++ type to the libpqtypes format specifier used for it.
+		//Only declared here; the supported types are explicit
+		//specialisations defined in connection.cpp.
+		template <typename T>
+		const char* type_to_pqtypes_name ( void );
+
+		//Converts a C++ argument into the value that is actually passed
+		//through the C varargs call in Connection::make_params. The primary
+		//template passes the value through unchanged.
+		template <typename T>
+		struct get_pqlib_c_type_struct {
+			using type = T;
+			static type conv ( T parParam ) { return parParam; }
+		};
+		//std::string: hand over the NUL-terminated buffer it already owns.
+		template <>
+		struct get_pqlib_c_type_struct<std::string> {
+			using type = const char*;
+			static type conv ( const std::string& parParam ) { return parParam.c_str(); }
+		};
+		//boost::string_ref: the viewed characters are not guaranteed to be
+		//NUL-terminated, while the returned pointer is consumed as a C
+		//string ("%text"). Keep a terminated copy that lives as long as
+		//this object does and return a pointer into it.
+		template <>
+		struct get_pqlib_c_type_struct<boost::string_ref> {
+			using type = const char*;
+			type conv ( const boost::string_ref& parParam ) {
+				m_copy.assign(parParam.begin(), parParam.end());
+				return m_copy.c_str();
+			}
+
+		private:
+			std::string m_copy;
+		};
+		//bool: passed as an int holding 1 or 0.
+		template <>
+		struct get_pqlib_c_type_struct<bool> {
+			using type = int;
+			static type conv ( bool parParam ) { return (parParam ? 1 : 0); }
+		};
+		//system_clock::time_point: converted into a timestamp structure
+		//held in m_storage. StorageStruct only mirrors the size/alignment
+		//that structure is expected to have, so that this header does not
+		//need the libpqtypes headers; conv() static_asserts the match.
+		template <>
+		struct get_pqlib_c_type_struct<std::chrono::system_clock::time_point> {
+			struct StorageStruct { uint64_t epoch; int a[14]; char tzabbr[16]; };
+			static constexpr std::size_t DATA_SIZE = sizeof(StorageStruct);
+			using storage = std::aligned_storage<DATA_SIZE, alignof(uint64_t)>::type;
+			storage m_storage;
+
+		public:
+			using type = const storage*;
+
+			//Returns a pointer to m_storage; valid while this object lives.
+			type conv ( const std::chrono::system_clock::time_point& parParam );
+			~get_pqlib_c_type_struct ( void ) noexcept;
+		};
+
+		//Convenience wrapper around get_pqlib_c_type_struct<T>::conv.
+		//It calls conv without an object, so it is only usable with the
+		//specialisations whose conv is static.
+		template <typename T>
+		inline typename get_pqlib_c_type_struct<T>::type get_pqlib_c_type (const T& parParam) {
+			return get_pqlib_c_type_struct<T>::conv(parParam);
+		}
+	} //namespace implem
+
+	//Run parQuery as a parameterised statement, binding parArgs to its
+	//placeholders in order. Each argument type must have a
+	//type_to_pqtypes_name specialisation.
+	template <typename... Args>
+	void Connection::query_void (const std::string& parQuery, Args&&... parArgs) {
+		using std::remove_cv;
+		using std::remove_reference;
+		using implem::type_to_pqtypes_name;
+
+		//Space-separated libpqtypes specifiers, one per argument, built
+		//by expanding the pack inside a dummy array initialiser.
+		std::string types;
+		const int expand[] {0, (
+			types += type_to_pqtypes_name<typename remove_cv<typename remove_reference<Args>::type>::type>(),
+			types += ' ',
+			0
+		)...};
+		static_cast<void>(expand);
+		if (not types.empty()) {
+			//Drop the separator appended after the last specifier.
+			types.pop_back();
+		}
+
+		//The converter temporaries stay alive until make_params returns,
+		//which keeps any pointers they hand out valid during the call.
+		PGParams pgparams = this->make_params(&types, implem::get_pqlib_c_type_struct<typename remove_cv<typename remove_reference<Args>::type>::type>().conv(parArgs)...);
+
+		this->query_void_params(parQuery, pgparams);
+	}
 } //namespace pq

 #endif
--- a/src/scan/dbbackend.cpp
+++ b/src/scan/dbbackend.cpp
@ -18,6 +18,7 @@
 #include "dbbackend.hpp"
 #include "pq/connection.hpp"
 #include "dindexer-common/settings.hpp"
+#include "recorddata.hpp"
 #include <string>
 #include <sstream>
 #include <utility>
@ -25,11 +26,10 @@
 #include <exception>
 #include <memory>
 #include <boost/utility/string_ref.hpp>
+#include <chrono>

 namespace din {
 	namespace {
-		const std::size_t g_batch_size = 100;
-
 		std::string make_set_insert_query (pq::Connection& parConn, const SetRecordData& parSetData) {
 			std::ostringstream oss;
 			oss << "INSERT INTO \"sets\" (\"desc\",\"type\") VALUES ("
@ -38,15 +38,9 @@ namespace din {
 				<< ");";
 			return oss.str();
 		}
-
-		boost::string_ref time_to_str (const std::time_t parTime, char* parBuff, std::size_t parLength) {
-			const auto gtm = std::gmtime(&parTime);
-			const auto len = std::strftime(parBuff, parLength, "%F %T%z", gtm);
-			return boost::string_ref(parBuff, len);
-		}
 	} //unnamed namespace

-	bool read_from_db (FileRecordData& parItem, SetRecordDataFull& parSet, const dinlib::SettingsDB& parDB, std::string&& parHash) {
+	bool read_from_db (FileRecordData& parItem, SetRecordDataFull& parSet, const dinlib::SettingsDB& parDB, const TigerHash& parHash) {
 		using boost::lexical_cast;

 		pq::Connection conn(std::string(parDB.username), std::string(parDB.password), std::string(parDB.dbname), std::string(parDB.address), parDB.port);
@ -55,8 +49,8 @@ namespace din {
 		uint32_t group_id;
 		{
 			std::ostringstream oss;
-			oss << "SELECT path,level,group_id,is_directory,is_symlink,size FROM files WHERE hash=" <<
-				conn.escaped_literal(parHash) <<
+			oss << "SELECT path,level,group_id,is_directory,is_symlink,size FROM files WHERE hash='" <<
+				tiger_to_string(parHash, true) << "'" <<
 				" LIMIT 1;";

 			auto resultset = conn.query(oss.str());
@ -66,7 +60,7 @@ namespace din {

 			auto row = resultset[0];
 			parItem.path = row["path"];
-			parItem.hash = std::move(parHash);
+			parItem.hash = parHash;
 			parItem.level = lexical_cast<uint16_t>(row["level"]);
 			parItem.size = lexical_cast<uint64_t>(row["size"]);
 			parItem.is_directory = (row["is_directory"] == "t" ? true : false);
@ -94,47 +88,36 @@ namespace din {
 	}

 	void write_to_db (const dinlib::SettingsDB& parDB, const std::vector<FileRecordData>& parData, const SetRecordData& parSetData) {
-		auto bool_to_str = [](bool b) { return (b ? "true" : "false"); };
+		using std::chrono::system_clock;
+
 		if (parData.empty()) {
 			return;
 		}
 
-		const std::size_t strtime_buff_size = 512;
-		std::unique_ptr<char[]> strtime_buff(new char[strtime_buff_size]);
-
+		//The statement text is identical for every row, so build it once
+		//instead of once per inserted file. Values are bound through the
+		//$n placeholders rather than pasted into the text.
+		const std::string query = "INSERT INTO \"files\" (path, hash, "
+			"level, group_id, is_directory, is_symlink, size, "
+			"access_time, modify_time, is_hash_valid, unreadable) VALUES "
+			"($1, $2, $3, currval('\"sets_id_seq\"'), $4, $5, $6, $7, $8, $9, $10);";
+
 		pq::Connection conn(std::string(parDB.username), std::string(parDB.password), std::string(parDB.dbname), std::string(parDB.address), parDB.port);
 		conn.connect();
 
 		conn.query_void("BEGIN;");
 		conn.query_void(make_set_insert_query(conn, parSetData));
-		//TODO: use COPY instead of INSERT INTO
-		for (std::size_t z = 0; z < parData.size(); z += g_batch_size) {
-			std::ostringstream query;
-			query << "INSERT INTO \"files\" " <<
-				"(path, hash, level, group_id, is_directory, is_symlink, size, " <<
-				"access_time, modify_time, is_hash_valid, unreadable) VALUES "
-			;
-
-			const char* comma = "";
-			for (auto i = z; i < std::min(z + g_batch_size, parData.size()); ++i) {
-				const auto& itm = parData[i];
-				query << comma;
-				query << '(' << conn.escaped_literal(itm.path) << ",'" << itm.hash << "',"
-					<< itm.level << ','
-					<< "currval('\"sets_id_seq\"')" << ','
-					<< bool_to_str(itm.is_directory) << ','
-					<< (itm.is_symlink ? "true" : "false") << ',' << itm.size
-					<< ',' << '\'' << time_to_str(itm.atime, strtime_buff.get(), strtime_buff_size) << '\''
-					<< ',' << '\'' << time_to_str(itm.mtime, strtime_buff.get(), strtime_buff_size) << '\''
-					<< ',' << bool_to_str(itm.hash_valid)
-					<< ',' << bool_to_str(itm.unreadable)
-				<< ')';
-				comma = ",";
-			}
-			query << ';';
-			//query << "\nCOMMIT;";
-
-			conn.query_void(query.str());
+		//One INSERT per record, all inside the transaction opened above.
+		for (const auto& itm : parData) {
+			conn.query_void(query,
+				itm.path,
+				tiger_to_string(itm.hash),
+				itm.level,
+				itm.is_directory,
+				itm.is_symlink,
+				itm.size,
+				system_clock::from_time_t(itm.atime),
+				system_clock::from_time_t(itm.mtime),
+				itm.hash_valid,
+				itm.unreadable
+			);
 		}
 		conn.query_void("COMMIT;");
 	}
--- a/src/scan/dbbackend.hpp
+++ b/src/scan/dbbackend.hpp
@ -21,40 +21,19 @@
 #include <string>
 #include <vector>
 #include <cstdint>
-#include <boost/utility/string_ref.hpp>
-#include <ctime>

 namespace dinlib {
 	struct SettingsDB;;
 } //namespace dinlib

 namespace din {
-	struct FileRecordData {
-		std::string path;
-		std::string hash;
-		std::time_t atime;
-		std::time_t mtime;
-		uint16_t level;
-		uint64_t size;
-		bool is_directory;
-		bool is_symlink;
-		bool unreadable;
-		bool hash_valid;
-	};
-
-	struct SetRecordDataFull {
-		std::string name;
-		uint32_t disk_number;
-		char type;
-	};
-
-	struct SetRecordData {
-		const boost::string_ref name;
-		const char type;
-	};
+	struct FileRecordData;
+	struct SetRecordData;
+	struct SetRecordDataFull;
+	struct TigerHash;

 	void write_to_db ( const dinlib::SettingsDB& parDB, const std::vector<FileRecordData>& parData, const SetRecordData& parSetData );
-	bool read_from_db ( FileRecordData& parItem, SetRecordDataFull& parSet, const dinlib::SettingsDB& parDB, std::string&& parHash );
+	bool read_from_db ( FileRecordData& parItem, SetRecordDataFull& parSet, const dinlib::SettingsDB& parDB, const TigerHash& parHash );
 } //namespace din

 #endif
--- a/src/scan/indexer.cpp
+++ b/src/scan/indexer.cpp
@ -22,6 +22,7 @@
 #include "dindexer-common/settings.hpp"
 #include "filestats.hpp"
 #include "mimetype.hpp"
+#include "recorddata.hpp"
 #include <algorithm>
 #include <functional>
 #include <vector>
@ -40,50 +41,18 @@
 #if defined(INDEXER_VERBOSE)
 #	include <iostream>
 #endif
+#include <boost/utility/string_ref.hpp>

 namespace din {
-	typedef TigerHash HashType;
-
-	struct FileEntry {
-		FileEntry ( const char* parPath, const fastf::FileStats& parSt ) :
-			path(parPath),
-			hash {},
-			access_time(parSt.atime),
-			modify_time(parSt.mtime),
-			//file_size(0),
-			level(static_cast<uint16_t>(parSt.level)),
-			is_dir(parSt.is_dir),
-			is_symlink(parSt.is_symlink),
-			unreadable(false)
-		{
-		}
-
-		FileEntry ( const FileEntry& ) = delete;
-		FileEntry ( FileEntry&& ) = default;
-		FileEntry& operator= ( const FileEntry& ) = delete;
-		FileEntry& operator= ( FileEntry&& ) = default;
-		bool operator< ( const FileEntry& parOther ) const;
-		bool operator== ( const FileEntry& ) const = delete;
-
-		std::string path;
-		std::string mime;
-		HashType hash;
-		std::time_t access_time;
-		std::time_t modify_time;
-		uint64_t file_size;
-		uint16_t level;
-		bool is_dir;
-		bool is_symlink;
-		bool unreadable;
-	};
+	using HashType = decltype(FileRecordData::hash);

 	namespace {
-		typedef std::vector<FileEntry>::iterator FileEntryIt;
+		typedef std::vector<FileRecordData>::iterator FileEntryIt;

 		void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback, bool parIgnoreErrors, MimeType& parMime) {
 			assert(parEntry != parEnd);
-			assert(parEntry->is_dir);
-			FileEntry& curr_entry = *parEntry;
+			assert(parEntry->is_directory);
+			FileRecordData& curr_entry = *parEntry;
 			auto& curr_entry_it = parEntry;

 			//Build a blob with the hashes and filenames of every directory that
@ -106,10 +75,10 @@ namespace din {
 #if defined(INDEXER_VERBOSE)
 				std::cout << "Making initial hash for " << parCurrDir << "...\n";
 #endif
-				curr_entry.mime = parMime.analyze(it_entry->path);
+				curr_entry.mime_full = parMime.analyze(it_entry->path);
 				while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
 					PathName curr_subdir(it_entry->path);
-					if (it_entry->is_dir) {
+					if (it_entry->is_directory) {
 						hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback, parIgnoreErrors, parMime);

 						std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
@ -128,11 +97,11 @@ namespace din {
 				}

 				tiger_data(dir_blob, curr_entry.hash);
-				curr_entry.file_size = 0;
+				curr_entry.size = 0;
 #if defined(INDEXER_VERBOSE)
 				std::cout << "Got intermediate hash for dir " << parCurrDir <<
 					": " << tiger_to_string(curr_entry.hash) <<
-					' ' << curr_entry.mime << '\n';
+					' ' << curr_entry.mime_type << '\n';
 #endif
 			}

@ -141,7 +110,7 @@ namespace din {
 				auto it_entry = curr_entry_it;
 				while (
 					it_entry != parEnd
-					and (it_entry->is_dir
+					and (it_entry->is_directory
 						or it_entry->level != curr_entry_it->level + 1
 						or PathName(it_entry->path).pop_right() != parCurrDir
 					)
@ -149,15 +118,19 @@ namespace din {
 					++it_entry;
 				}

-				while (it_entry != parEnd and not it_entry->is_dir and it_entry->level == curr_entry_it->level + 1 and PathName(it_entry->path).pop_right() == parCurrDir) {
-					assert(not it_entry->is_dir);
+				while (it_entry != parEnd and not it_entry->is_directory and it_entry->level == curr_entry_it->level + 1 and PathName(it_entry->path).pop_right() == parCurrDir) {
+					assert(not it_entry->is_directory);
 #if defined(INDEXER_VERBOSE)
 					std::cout << "Hashing file " << it_entry->path << "...";
 #endif
 					parNextItemCallback(it_entry - parBegin);
 					try {
-						tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
-						it_entry->mime = parMime.analyze(it_entry->path);
+						tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->size);
+						it_entry->hash_valid = true;
+						it_entry->mime_full = parMime.analyze(it_entry->path);
+						auto mime_pair = split_mime(it_entry->mime_full);
+						it_entry->mime_type = mime_pair.first;
+						it_entry->mime_charset = mime_pair.second;
 					}
 					catch (const std::ios_base::failure& e) {
 						if (parIgnoreErrors) {
@ -170,8 +143,8 @@ namespace din {
 					}

 #if defined(INDEXER_VERBOSE)
-					std::cout << ' ' << tiger_to_string(it_entry->hash) <<
-						' ' << it_entry->mime << '\n';
+					std::cout << ' ' << tiger_to_string(it_entry->hash) << ' ' <<
+						"Mime type: \"" << it_entry->mime_type << "\"\n";
 #endif
 					++it_entry;
 				}
@ -180,16 +153,28 @@ namespace din {
 #if defined(INDEXER_VERBOSE)
 			std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n';
 #endif
+			curr_entry_it->hash_valid = true;
 		}

 		template <bool FileTrue=true>
 		struct IsFile {
-			bool operator() ( const FileEntry& parEntry ) const { return parEntry.is_dir xor FileTrue; }
+			//With FileTrue == true the predicate accepts non-directories,
+			//with FileTrue == false it accepts directories.
+			bool operator() ( const FileRecordData& parEntry ) const { return parEntry.is_directory != FileTrue; }
 		};
+
+		//Build the initial record for a scanned path from its stat data;
+		//every other field takes the value set by the FileRecordData
+		//constructor.
+		FileRecordData make_file_record_data (const char* parPath, const fastf::FileStats& parSt) {
+			return FileRecordData(parPath, parSt.atime, parSt.mtime, parSt.level, parSt.is_dir, parSt.is_symlink);
+		}
 	} //unnamed namespace

 	struct Indexer::LocalData {
-		typedef std::vector<FileEntry> PathList;
+		typedef std::vector<FileRecordData> PathList;

 		dinlib::SettingsDB db_settings;
 		PathList paths;
@ -202,12 +187,13 @@ namespace din {
 		bool ignore_read_errors;
 	};

-	bool FileEntry::operator< (const FileEntry& parOther) const {
-		const FileEntry& o = parOther;
+	bool file_record_data_lt (const FileRecordData& parLeft, const FileRecordData& parRight) {
+		const FileRecordData& l = parLeft;
+		const FileRecordData& r = parRight;
 		return
-			(level < o.level)
-			or (level == o.level and is_dir and not o.is_dir)
-			or (level == o.level and is_dir == o.is_dir and path < o.path)
+			(l.level < r.level)
+			or (l.level == r.level and l.is_directory and not r.is_directory)
+			or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path)

 			//sort by directory - parent first, children later
 			//(level == o.level and is_dir and not o.is_dir)
@ -257,7 +243,7 @@ namespace din {

 	void Indexer::calculate_hash() {
 		PathName base_path(m_local_data->paths.front().path);
-		std::sort(m_local_data->paths.begin(), m_local_data->paths.end());
+		std::sort(m_local_data->paths.begin(), m_local_data->paths.end(), &file_record_data_lt);
 		MimeType mime;

 #if defined(INDEXER_VERBOSE)
@ -266,7 +252,7 @@ namespace din {
 			itm.hash.part_b = 1;
 			itm.hash.part_c = 1;

-			if (itm.is_dir)
+			if (itm.is_directory)
 				std::cout << "(D) ";
 			else
 				std::cout << "(F) ";
@ -317,41 +303,25 @@ namespace din {
 #endif

 		if (not parForce) {
-			std::string first_hash(tiger_to_string(m_local_data->paths.front().hash, true));
+			const auto& first_hash = m_local_data->paths.front().hash;
 			FileRecordData itm;
 			SetRecordDataFull set;
-			const bool already_in_db = read_from_db(itm, set, m_local_data->db_settings, std::move(first_hash));
+			const bool already_in_db = read_from_db(itm, set, m_local_data->db_settings, first_hash);
 			if (already_in_db) {
 				return false;
 			}
 		}

 		PathName base_path(m_local_data->paths.front().path);
-		std::vector<FileRecordData> data;
-		data.reserve(m_local_data->paths.size());
-		for (const auto& itm : m_local_data->paths) {
-			data.push_back(FileRecordData {
-				make_relative_path(base_path, PathName(itm.path)).path(),
-				tiger_to_string(itm.hash),
-				itm.access_time,
-				itm.modify_time,
-				itm.level,
-				itm.file_size,
-				itm.is_dir,
-				itm.is_symlink,
-				itm.unreadable,
-				not itm.unreadable
-			});
-		}

 		SetRecordData set_data {parSetName, parType};
-		write_to_db(m_local_data->db_settings, data, set_data);
+		write_to_db(m_local_data->db_settings, m_local_data->paths, set_data);
 		return true;
 	}

 	bool Indexer::add_path (const char* parPath, const fastf::FileStats& parStats) {
 		m_local_data->paths.push_back(
-			FileEntry(parPath, parStats));
+			make_file_record_data(parPath, parStats));
 		if (not parStats.is_dir) {
 			++m_local_data->file_count;
 		}
@ -364,14 +334,14 @@ namespace din {

 		std::cout << "---------------- FILE LIST ----------------\n";
 		for (const auto& cur_itm : m_local_data->paths) {
-			if (not cur_itm.is_dir) {
+			if (not cur_itm.is_directory) {
 				PathName cur_path(cur_itm.path);
 				std::cout << make_relative_path(base_path, cur_path).path() << '\n';
 			}
 		}
 		std::cout << "---------------- DIRECTORY LIST ----------------\n";
 		for (const auto& cur_itm : m_local_data->paths) {
-			if (cur_itm.is_dir) {
+			if (cur_itm.is_directory) {
 				PathName cur_path(cur_itm.path);
 				std::cout << make_relative_path(base_path, cur_path).path() << '\n';
 			}
--- a/src/scan/main.cpp
+++ b/src/scan/main.cpp
@ -133,6 +133,10 @@ namespace {
 		parShowProgress = false;
 #endif
 		if (not parShowProgress) {
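+//NOTE(review): the four lines below look like pasted console output from a
+//run that aborted on an uncaught std::ios_base::failure while hashing;
+//"Annullato" is presumably the shell's Italian-locale "Aborted" message.
+//Hashing file /mnt/cdrom/Sacred 2/Fallen Angel/UK/Sacred.2.Fallen.Angel-ArenaBG/DISC2/S2DISC2.md1... 512c201321ed01cc2a82c9f80bfeaaa673bc8eb3cea4e5c1
+//terminate called after throwing an instance of 'std::ios_base::failure'
+//what():  basic_filebuf::xsgetn error reading the file
+//Hashing file /mnt/cdrom/Sacred 2/Fallen Angel/UK/Sacred.2.Fallen.Angel-ArenaBG/DISC2/S2DISC2.mdf...Annullato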
 			parIndexer.calculate_hash();
 		}
 #if defined(WITH_PROGRESS_FEEDBACK)
--- a/src/scan/recorddata.hpp
+++ b/src/scan/recorddata.hpp
@ -0,0 +1,86 @@
+/* Copyright 2015, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef id3CD7F105AC314540A864487E981E5A7E
+#define id3CD7F105AC314540A864487E981E5A7E
+
+#include "tiger.hpp"
+#include <string>
+#include <boost/utility/string_ref.hpp>
+#include <cstdint>
+#include <ctime>
+#include <boost/flyweight.hpp>
+#include <boost/flyweight/no_locking.hpp>
+
+namespace din {
+	//Everything recorded about one scanned filesystem entry. Move-only:
+	//copying is disabled so records cannot be duplicated by accident.
+	struct FileRecordData {
+		struct MimeStringTagStruct { };
+		typedef boost::flyweights::tag<MimeStringTagStruct> MimeStringTag;
+		typedef boost::flyweight<std::string, boost::flyweights::no_locking, MimeStringTag> mime_string;
+
+		//Default construction yields a fully initialised, empty record
+		//thanks to the default member initialisers below.
+		FileRecordData ( void ) = default;
+		//Builds the record for a freshly scanned path. Members not listed
+		//here keep their default member initialisers (empty/zero/false).
+		FileRecordData ( const char* parPath, std::time_t parATime, std::time_t parMTime, uint64_t parLevel, bool parIsDir, bool parIsSymLink ) :
+			path(parPath),
+			atime(parATime),
+			mtime(parMTime),
+			//The member is 16 bits wide; make the narrowing explicit.
+			level(static_cast<uint16_t>(parLevel)),
+			is_directory(parIsDir),
+			is_symlink(parIsSymLink)
+		{
+		}
+
+		FileRecordData ( const FileRecordData& ) = delete;
+		FileRecordData ( FileRecordData&& ) = default;
+		FileRecordData& operator= ( const FileRecordData& ) = delete;
+		FileRecordData& operator= ( FileRecordData&& ) = default;
+		bool operator== ( const FileRecordData& ) const = delete;
+
+		TigerHash hash {};
+		std::string path;
+		mime_string mime_full;
+		std::time_t atime = 0;
+		std::time_t mtime = 0;
+		//Non-owning views; the scanner fills them from split_mime(mime_full).
+		boost::string_ref mime_type;
+		boost::string_ref mime_charset;
+		uint64_t size = 0;
+		uint16_t level = 0;
+		bool is_directory = false;
+		bool is_symlink = false;
+		bool unreadable = false;
+		bool hash_valid = false;
+	};
+
+	struct SetRecordDataFull { //Description of a set that owns its name string; read_from_db takes one as an output parameter.
+		std::string name;
+		uint32_t disk_number;
+		char type; //NOTE(review): meaning of the type codes is not visible here - confirm against the "sets" table.
+	};
+
+	struct SetRecordData { //Lightweight description of a set as passed to write_to_db.
+		const boost::string_ref name; //Non-owning view: the referenced characters must outlive this object.
+		const char type;
+	};
+} //namespace din
+
+#endif