Implement hashing task.

Also get rid of the ShortFileRecordData and put the new LeanBase class to use.
2024-11-29 01:33:46 +00:00 · 2016-03-08 08:48:12 +01:00 · 2016-03-08 08:48:12 +01:00 · d2588d3c7e
commit d2588d3c7e
parent c64e572fc8
9 changed files with 247 additions and 52 deletions
--- a/include/dindexer-machinery/recorddata.hpp
+++ b/include/dindexer-machinery/recorddata.hpp
@ -51,6 +51,24 @@ namespace mchlib {
 		{
 		}

+		//Builds a record out of a path and basic file stats.
+		//parPath is moved into abs_path, and path becomes a view into abs_path
+		//that starts parRelPathOffs characters in. Mime info and hash are left
+		//empty, size is 0, and the unreadable and hash_valid flags start out
+		//as false.
+		FileRecordData (
+			std::string&& parPath,
+			std::size_t parRelPathOffs,
+			std::time_t parATime,
+			std::time_t parMTime,
+			uint16_t parLevel,
+			bool parIsDir,
+			bool parIsSymLink
+		) :
+			hash {},
+			abs_path(std::move(parPath)),
+			mime_full(),
+			atime(parATime),
+			mtime(parMTime),
+			path(boost::string_ref(abs_path).substr(parRelPathOffs)),
+			mime_type(),
+			mime_charset(),
+			size(0),
+			level(parLevel),
+			is_directory(parIsDir),
+			is_symlink(parIsSymLink),
+			unreadable(false),
+			hash_valid(false)
+		{
+		}
+
 #if defined(NDEBUG)
 		FileRecordData ( const FileRecordData& ) = delete;
 #else
@ -79,16 +97,6 @@ namespace mchlib {
 		bool hash_valid;
 	};

-	struct ShortFileRecordData {
-		std::string abs_path;
-		std::string path;
-		std::time_t atime;
-		std::time_t mtime;
-		uint16_t level;
-		bool is_directory;
-		bool is_symlink;
-	};
-
 	struct SetRecordData {
 		boost::string_ref name;
 		char type;
--- a/include/dindexer-machinery/scantask/dirtree.hpp
+++ b/include/dindexer-machinery/scantask/dirtree.hpp
@ -23,12 +23,12 @@
 #include <vector>

 namespace mchlib {
-	struct ShortFileRecordData;
+	struct FileRecordData;

 	namespace scantask {
-		class DirTree : public Base<std::vector<ShortFileRecordData>> {
+		class DirTree : public Base<std::vector<FileRecordData>> {
 		public:
-			typedef std::vector<ShortFileRecordData> PathList;
+			typedef std::vector<FileRecordData> PathList;

 			explicit DirTree ( std::string parRoot );
 			virtual ~DirTree ( void ) noexcept = default;
--- a/include/dindexer-machinery/scantask/hashing.hpp
+++ b/include/dindexer-machinery/scantask/hashing.hpp
@ -0,0 +1,47 @@
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef idC7CC55298AC049EAA80604D6C7FD081D
+#define idC7CC55298AC049EAA80604D6C7FD081D
+
+#include "dindexer-machinery/scantask/leanbase.hpp"
+#include "dindexer-machinery/tiger.hpp"
+#include <vector>
+#include <memory>
+
+namespace mchlib {
+	struct FileRecordData;
+
+	namespace scantask {
+		//Task that works on the record list produced by another task (the
+		//"file tree" task): the data it exposes is that same list, which it
+		//hashes in place when asked to fill its data.
+		class Hashing : public LeanBase<std::vector<FileRecordData>> {
+		public:
+			using FileTreeBase = LeanBase<std::vector<FileRecordData>>;
+
+			//parFileTree is the task providing the records to hash; it is
+			//asserted to be non-null. When parIgnoreErrors is true a file
+			//that fails with std::ios_base::failure is flagged as unreadable
+			//instead of letting the exception propagate.
+			Hashing ( std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors );
+			virtual ~Hashing() noexcept;
+
+		private:
+			void on_data_fill() override;
+			std::vector<FileRecordData>& on_data_get() override;
+
+			std::shared_ptr<FileTreeBase> m_file_tree_task;
+			bool m_ignore_errors;
+		};
+	} //namespace scantask
+} //namespace mchlib
+
+#endif
--- a/include/dindexer-machinery/set_listing.hpp
+++ b/include/dindexer-machinery/set_listing.hpp
@ -40,19 +40,21 @@ namespace mchlib {
 	template <bool Const>
 	implem::DirIterator<Const> first_file ( SetListingView<Const>& parList );

+	typedef FileRecordData SetListingItemType;
+
 	namespace implem {
 		template <bool Const>
-		class DirIterator : public boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> {
+		class DirIterator : public boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> {
 			friend class mchlib::SetListingView<Const>;
 			friend class boost::iterator_core_access;
 			template <bool> friend class DirIterator;
-			typedef boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> base_class;
+			typedef boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> base_class;
 			struct enabler {};
 		public:
 			typedef typename std::conditional<
 				Const,
-				std::vector<mchlib::FileRecordData>::const_iterator,
-				std::vector<mchlib::FileRecordData>::iterator
+				std::vector<SetListingItemType>::const_iterator,
+				std::vector<SetListingItemType>::iterator
 			>::type VecIterator;
 			typedef typename base_class::difference_type difference_type;
 			typedef typename base_class::value_type value_type;
@ -127,8 +129,7 @@ namespace mchlib {

 	class SetListing {
 	public:
-		typedef std::vector<FileRecordData> ListType;
-		typedef std::vector<ShortFileRecordData> ShortListType;
+		typedef std::vector<SetListingItemType> ListType;
 		typedef implem::DirIterator<true> const_iterator;

 		explicit SetListing ( ListType&& parList, bool parSort=true );
@ -152,7 +153,6 @@ namespace mchlib {

 		static void sort_list ( ListType& parList );
 		static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
-		static ShortListType::iterator lower_bound ( ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );

 	private:
 		ListType m_list;
--- a/src/machinery/CMakeLists.txt
+++ b/src/machinery/CMakeLists.txt
@ -18,6 +18,7 @@ add_library(${PROJECT_NAME} SHARED
 	globbing.cpp
 	scantask/dirtree.cpp
 	scantask/mediatype.cpp
+	scantask/hashing.cpp
 )

 #target_include_directories(${PROJECT_NAME}
--- a/src/machinery/scantask/dirtree.cpp
+++ b/src/machinery/scantask/dirtree.cpp
@ -18,6 +18,7 @@
 #include "dindexer-machinery/scantask/dirtree.hpp"
 #include "dindexer-machinery/recorddata.hpp"
 #include "dindexer-machinery/set_listing.hpp"
+#include "helpers/compatibility.h"
 #include "filesearcher.hpp"
 #include "pathname.hpp"
 #include <utility>
@ -28,6 +29,17 @@

 namespace mchlib {
 	namespace {
+		std::size_t calc_rel_path_offs ( const PathName& parRoot, boost::string_ref parPath ) a_pure;
+
+		//Returns how many characters the string form of parPath is longer
+		//than the string form of the same path made relative to parRoot.
+		//The relative form is asserted not to be the longer of the two.
+		std::size_t calc_rel_path_offs (const PathName& parRoot, boost::string_ref parPath) {
+			PathName whole_path(parPath);
+			const auto relative_len = make_relative_path(parRoot, whole_path).str_path_size();
+			const auto whole_len = whole_path.str_path_size();
+			assert(relative_len <= whole_len);
+			return whole_len - relative_len;
+		}
+
 		bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) {
 			using boost::string_ref;

@ -48,15 +60,15 @@ namespace mchlib {

 			parOut.insert(
 				it_before,
-				ShortFileRecordData {
-					std::string(parPath),
-					make_relative_path(parRoot, PathName(string_ref(parPath))).path(),
+				FileRecordData(
+					parPath,
+					calc_rel_path_offs(parRoot, string_ref(parPath)),
 					parStats.atime,
 					parStats.mtime,
 					static_cast<uint16_t>(parStats.level),
 					static_cast<bool>(parStats.is_dir),
 					static_cast<bool>(parStats.is_symlink)
-				}
+				)
 			);
 			return true;
 		}
--- a/src/machinery/scantask/hashing.cpp
+++ b/src/machinery/scantask/hashing.cpp
@ -0,0 +1,124 @@
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "dindexer-machinery/scantask/hashing.hpp"
+#include "dindexer-machinery/recorddata.hpp"
+#include "dindexer-machinery/set_listing.hpp"
+#include "pathname.hpp"
+#include <cassert>
+#include <boost/range/empty.hpp>
+#include <boost/utility/string_ref.hpp>
+
+namespace mchlib {
+	namespace {
+
+		//Appends the sizeof(TigerHash) bytes of parHash.byte_data, followed
+		//by the characters of parString, to the end of parDest.
+		void append_to_vec (std::vector<char>& parDest, const TigerHash& parHash, boost::string_ref parString) {
+			parDest.insert(parDest.end(), parHash.byte_data, parHash.byte_data + sizeof(TigerHash));
+			parDest.insert(parDest.end(), parString.begin(), parString.end());
+		}
+
+		//Appends the characters of parString to the end of parDest.
+		void append_to_vec (std::vector<char>& parDest, boost::string_ref parString) {
+			parDest.insert(parDest.end(), parString.begin(), parString.end());
+		}
+
+		//Hashes the directory parEntry and, recursively, everything in
+		//parList, which is expected to be the view over parEntry's direct
+		//children (asserted on every item).
+		//Subdirectories are hashed first; their hashes and paths, plus the
+		//paths of the files, are collected into a blob whose tiger hash is
+		//stored in parEntry.hash. Files are then hashed one by one through
+		//tiger_file(), which also receives parEntry.hash.
+		//If hashing a file throws std::ios_base::failure and parIgnoreErrors
+		//is true, that file is flagged as unreadable, gets an empty hash and
+		//keeps hash_valid == false; otherwise the exception is rethrown.
+		void hash_dir (FileRecordData& parEntry, MutableSetListingView& parList, bool parIgnoreErrors) {
+			assert(parEntry.is_directory);
+
+			//Build a blob with the hash and path of every subdirectory and
+			//the path of every file that is a direct child of current entry
+			std::vector<char> dir_blob;
+#if defined(INDEXER_VERBOSE)
+			std::cout << "Making initial hash for " << parEntry.abs_path << "...\n";
+#endif
+			for (auto it = parList.begin(); it != parList.end(); ++it) {
+				assert(PathName(parEntry.abs_path) == PathName(it->abs_path).pop_right());
+
+				if (it->is_directory) {
+					auto cd_list = MutableSetListingView(it);
+					assert(boost::empty(cd_list) or cd_list.begin()->abs_path != it->abs_path);
+
+					//Children first: the subdirectory's hash must be ready
+					//before it can go into this directory's blob
+					hash_dir(*it, cd_list, parIgnoreErrors);
+					append_to_vec(dir_blob, it->hash, it->path);
+				}
+				else {
+					append_to_vec(dir_blob, it->path);
+				}
+			}
+			tiger_data(dir_blob, parEntry.hash);
+
+#if defined(INDEXER_VERBOSE)
+			std::cout << "Got intermediate hash for dir " << parEntry.abs_path <<
+				": " << tiger_to_string(parEntry.hash) <<
+				' ' << parEntry.mime_type << '\n';
+#endif
+
+			//Now with the initial hash ready, let's start hashing files, if any
+			for (auto it = first_file(parList); it != parList.end(); ++it) {
+				assert(not it->is_directory);
+#if defined(INDEXER_VERBOSE)
+				std::cout << "Hashing file " << it->abs_path << "...";
+#endif
+				//TODO: notify callback
+				try {
+					//NOTE(review): parEntry.hash is presumably updated by
+					//tiger_file with the file's content - confirm
+					tiger_file(it->abs_path, it->hash, parEntry.hash, it->size);
+					it->hash_valid = true;
+				}
+				catch (const std::ios_base::failure&) {
+					if (parIgnoreErrors) {
+						it->unreadable = true;
+						it->hash = TigerHash {};
+					}
+					else {
+						//Plain rethrow: keeps the dynamic type of the
+						//original exception instead of throwing a copy
+						throw;
+					}
+				}
+			}
+
+#if defined(INDEXER_VERBOSE)
+			std::cout << "Final hash for dir " << parEntry.abs_path << " is " << tiger_to_string(parEntry.hash) << '\n';
+#endif
+			parEntry.hash_valid = true;
+		}
+	} //unnamed namespace
+
+	namespace scantask {
+		Hashing::Hashing (std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors) :
+			m_file_tree_task(parFileTree),
+			m_ignore_errors(parIgnoreErrors)
+		{
+			assert(m_file_tree_task);
+		}
+
+		//Nothing to release by hand; defined here rather than in the header.
+		Hashing::~Hashing() noexcept = default;
+
+		//The data of this task is the very same list owned by the file
+		//tree task, so just forward to it.
+		std::vector<FileRecordData>& Hashing::on_data_get() {
+			auto& tree_records = m_file_tree_task->get_or_create();
+			return tree_records;
+		}
+
+		//Hashes, in place, the list obtained from the file tree task. The
+		//first record of the list is passed to hash_dir() as the directory
+		//to hash, together with a view over the whole list.
+		void Hashing::on_data_fill() {
+			std::vector<FileRecordData>& file_list = m_file_tree_task->get_or_create();
+			//front() on an empty vector is undefined behaviour, and there
+			//would be nothing to hash anyway
+			if (file_list.empty()) {
+				return;
+			}
+
+			MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
+			hash_dir(file_list.front(), recordlist, m_ignore_errors);
+		}
+	} //namespace scantask
+} //namespace mchlib
--- a/src/machinery/set_listing.cpp
+++ b/src/machinery/set_listing.cpp
@ -29,26 +29,26 @@ namespace mchlib {
 		//to be made.
 		struct FileRecordDataForSearch {
 			FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) :
-				abs_path(parPath),
+				path(parPath),
 				level(parLevel),
 				is_directory(parIsDir)
 			{
 				assert(parPath);
 			}

-			boost::string_ref abs_path;
+			boost::string_ref path;
 			uint16_t level;
 			bool is_directory;
 		};

-		template <typename RecordType, typename OtherRecord>
-		bool file_record_data_lt (const RecordType& parLeft, const OtherRecord& parRight) {
-			const RecordType& l = parLeft;
+		template <typename OtherRecord>
+		bool file_record_data_lt (const SetListingItemType& parLeft, const OtherRecord& parRight) {
+			const SetListingItemType& l = parLeft;
 			const OtherRecord& r = parRight;
 			return
 				(l.level < r.level)
 				or (l.level == r.level and l.is_directory and not r.is_directory)
-				or (l.level == r.level and l.is_directory == r.is_directory and l.abs_path < r.abs_path)
+				or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path)

 				//sort by directory - parent first, children later
 				//(level == o.level and is_dir and not o.is_dir)
@ -99,14 +99,14 @@ namespace mchlib {
 		{
 			assert(parBasePath);
 			assert(m_base_path or m_current == m_end);
-			assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->abs_path).atom_count());
+			assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->path).atom_count());
 			assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset);

 			//Look for the point where the children of this entry start
 			while (
 				m_current != m_end and (
 					m_current->level + m_level_offset == m_base_path->atom_count() or
-					*m_base_path != PathName(m_current->abs_path).pop_right()
+					*m_base_path != PathName(m_current->path).pop_right()
 			)) {
 				assert(m_base_path);
 				++m_current;
@ -157,13 +157,13 @@ namespace mchlib {

 		template <bool Const>
 		void DirIterator<Const>::increment() {
-			assert(PathName(m_current->abs_path).pop_right() == *m_base_path);
+			assert(PathName(m_current->path).pop_right() == *m_base_path);
 			do {
 				++m_current;
 			} while(
 				m_current != m_end and
 				m_current->level + m_level_offset == m_base_path->atom_count() + 1 and
-				*m_base_path != PathName(m_current->abs_path).pop_right()
+				*m_base_path != PathName(m_current->path).pop_right()
 			);
 		}

@ -222,7 +222,7 @@ namespace mchlib {
 			assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin()));
 		}
 		if (not m_list.empty()) {
-			m_base_path.reset(new PathName(m_list.front().abs_path));
+			m_base_path.reset(new PathName(m_list.front().path));
 		}
 	}

@ -258,7 +258,7 @@ namespace mchlib {
 		return std::count_if(
 			m_list.begin(),
 			m_list.end(),
-			[] (const FileRecordData& parItm) {
+			[] (const SetListingItemType& parItm) {
 				return not parItm.is_directory;
 			}
 		);
@ -268,7 +268,7 @@ namespace mchlib {
 		return std::count_if(
 			m_list.begin(),
 			m_list.end(),
-			[] (const FileRecordData& parItm) {
+			[] (const SetListingItemType& parItm) {
 				return parItm.is_directory;
 			}
 		);
@ -279,33 +279,27 @@ namespace mchlib {
 	}

 	void SetListing::sort_list (ListType& parList) {
-		std::sort(parList.begin(), parList.end(), &file_record_data_lt<FileRecordData, FileRecordData>);
+		std::sort(parList.begin(), parList.end(), &file_record_data_lt<SetListingItemType>);
 	}

 	SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
 		using boost::string_ref;
 		FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
-		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordData, FileRecordDataForSearch>);
-	}
-
-	SetListing::ShortListType::iterator SetListing::lower_bound (ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
-		using boost::string_ref;
-		FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
-		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<ShortFileRecordData, FileRecordDataForSearch>);
+		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordDataForSearch>);
 	}

 	SetListingView<false> SetListing::make_view() {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<false>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}

 	SetListingView<true> SetListing::make_view() const {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}

 	SetListingView<true> SetListing::make_cview() const {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}

@ -317,7 +311,7 @@ namespace mchlib {
 		m_level_offset(parIter.m_level_offset)
 	{
 		if (m_begin != m_end) {
-			m_base_path.reset(new PathName(m_begin->abs_path));
+			m_base_path.reset(new PathName(m_begin->path));
 		}
 	}

@ -329,7 +323,7 @@ namespace mchlib {
 		m_level_offset(parLevelOffset)
 	{
 		if (m_begin != m_end) {
-			m_base_path.reset(new PathName(m_begin->abs_path));
+			m_base_path.reset(new PathName(m_begin->path));
 		}
 	}

--- a/src/scan/main.cpp
+++ b/src/scan/main.cpp
@ -28,6 +28,7 @@
 #include "dbbackend.hpp"
 #include "dindexer-machinery/scantask/dirtree.hpp"
 #include "dindexer-machinery/scantask/mediatype.hpp"
+#include "dindexer-machinery/scantask/hashing.hpp"
 #include <iostream>
 #include <iomanip>
 #include <ciso646>
@ -76,8 +77,16 @@ int main (int parArgc, char* parArgv[]) {
 	}

 	const std::string search_path(vm["search-path"].as<std::string>());
-	mchlib::scantask::DirTree scan_dirtree(search_path);
-	mchlib::scantask::MediaType media_type(vm["type"].as<char>(), not vm.count("type"), search_path);
+	std::shared_ptr<mchlib::scantask::DirTree> scan_dirtree(new mchlib::scantask::DirTree(search_path));
+	std::shared_ptr<mchlib::scantask::MediaType> media_type(new mchlib::scantask::MediaType((vm.count("type") ? vm["type"].as<char>() : 'O'), not vm.count("type"), search_path));
+	std::shared_ptr<mchlib::scantask::Hashing> hashing(new mchlib::scantask::Hashing(scan_dirtree, true));
+
+	const auto& hashes = hashing->get_or_create();
+	for (const auto& hash : hashes) {
+		std::cout << mchlib::tiger_to_string(hash.hash) << std::endl;
+	}
+
+	return 0;

 #if defined(WITH_MEDIA_AUTODETECT)
 	//char set_type;