Implement hashing task.

Also get rid of ShortFileRecordData and put the new LeanBase class to use.
2025-08-21 15:50:50 +00:00 · 2016-03-08 08:48:12 +01:00 · 2016-03-08 08:48:12 +01:00 · d2588d3c7e
commit d2588d3c7e
parent c64e572fc8
9 changed files with 247 additions and 52 deletions
--- a/include/dindexer-machinery/recorddata.hpp
+++ b/include/dindexer-machinery/recorddata.hpp
@ -51,6 +51,24 @@ namespace mchlib {
 		{
 		}
 		//Builds a record for an entry whose content has not been inspected yet.
 		//parPath is taken over as abs_path; parRelPathOffs is the offset inside
 		//that string at which the relative path starts. Access/modification
 		//times, level and the directory/symlink flags are stored as given.
 		//The hash and mime fields start out empty, size is 0 and both
 		//unreadable and hash_valid are false.
 		//NOTE(review): path is a non-owning view into abs_path. This relies on
 		//abs_path being declared (and therefore initialized) before path, and
 		//the view presumably has to be re-seated whenever a record is copied or
 		//moved - confirm against the member declarations and the copy/move
 		//members, which are not visible here.
 		FileRecordData ( std::string&& parPath, std::size_t parRelPathOffs, std::time_t parATime, std::time_t parMTime, uint16_t parLevel, bool parIsDir, bool parIsSymLink ) :
 			hash {},
 			abs_path(std::move(parPath)),
 			mime_full(),
 			atime(parATime),
 			mtime(parMTime),
 			//Relative path: the tail of abs_path starting at parRelPathOffs
 			path(boost::string_ref(abs_path).substr(parRelPathOffs)),
 			mime_type(),
 			mime_charset(),
 			size(0),
 			level(parLevel),
 			is_directory(parIsDir),
 			is_symlink(parIsSymLink),
 			unreadable(false),
 			hash_valid(false)
 	{
 	}
 #if defined(NDEBUG)
 		FileRecordData ( const FileRecordData& ) = delete;
 #else
@ -79,16 +97,6 @@ namespace mchlib {
 		bool hash_valid;
 	};
 	struct ShortFileRecordData {
 		std::string abs_path;
 		std::string path;
 		std::time_t atime;
 		std::time_t mtime;
 		uint16_t level;
 		bool is_directory;
 		bool is_symlink;
 	};
 	struct SetRecordData {
 		boost::string_ref name;
 		char type;
--- a/include/dindexer-machinery/scantask/dirtree.hpp
+++ b/include/dindexer-machinery/scantask/dirtree.hpp
@ -23,12 +23,12 @@
 #include <vector>
 namespace mchlib {
-	struct ShortFileRecordData;
+	struct FileRecordData;
 	namespace scantask {
-		class DirTree : public Base<std::vector<ShortFileRecordData>> {
+		class DirTree : public Base<std::vector<FileRecordData>> {
 		public:
-			typedef std::vector<ShortFileRecordData> PathList;
+			typedef std::vector<FileRecordData> PathList;
 			explicit DirTree ( std::string parRoot );
 			virtual ~DirTree ( void ) noexcept = default;
--- a/include/dindexer-machinery/scantask/hashing.hpp
+++ b/include/dindexer-machinery/scantask/hashing.hpp
@ -0,0 +1,47 @@
 /* Copyright 2015, 2016, Michele Santullo
 * This file is part of "dindexer".
 *
 * "dindexer" is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * "dindexer" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef idC7CC55298AC049EAA80604D6C7FD081D
 #define idC7CC55298AC049EAA80604D6C7FD081D
 #include "dindexer-machinery/scantask/leanbase.hpp"
 #include "dindexer-machinery/tiger.hpp"
 #include <vector>
 #include <memory>
 namespace mchlib {
 	struct FileRecordData;
 	namespace scantask {
 		//Scan task that works on the record list produced by another task
 		//(the "file tree" task) and exposes that same list as its own data.
 		class Hashing : public LeanBase<std::vector<FileRecordData>> {
 		public:
 			//Type of the task that supplies the list of records to work on.
 			using FileTreeBase = LeanBase<std::vector<FileRecordData>>;
 			//parFileTree: task providing the records; shared with the caller.
 			//parIgnoreErrors: stored and forwarded to the hashing routine.
 			Hashing ( std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors );
 			virtual ~Hashing() noexcept;
 		private:
 			//LeanBase hooks
 			void on_data_fill() override;
 			std::vector<FileRecordData>& on_data_get() override;
 			std::shared_ptr<FileTreeBase> m_file_tree_task;
 			bool m_ignore_errors;
 		};
 	} //namespace scantask
 } //namespace mchlib
 #endif
--- a/include/dindexer-machinery/set_listing.hpp
+++ b/include/dindexer-machinery/set_listing.hpp
@ -40,19 +40,21 @@ namespace mchlib {
 	template <bool Const>
 	implem::DirIterator<Const> first_file ( SetListingView<Const>& parList );
 	typedef FileRecordData SetListingItemType;
 	namespace implem {
 		template <bool Const>
-		class DirIterator : public boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> {
+		class DirIterator : public boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> {
 			friend class mchlib::SetListingView<Const>;
 			friend class boost::iterator_core_access;
 			template <bool> friend class DirIterator;
-			typedef boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> base_class;
+			typedef boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> base_class;
 			struct enabler {};
 		public:
 			typedef typename std::conditional<
 				Const,
-				std::vector<mchlib::FileRecordData>::const_iterator,
+				std::vector<SetListingItemType>::const_iterator,
-				std::vector<mchlib::FileRecordData>::iterator
+				std::vector<SetListingItemType>::iterator
 			>::type VecIterator;
 			typedef typename base_class::difference_type difference_type;
 			typedef typename base_class::value_type value_type;
@ -127,8 +129,7 @@ namespace mchlib {
 	class SetListing {
 	public:
-		typedef std::vector<FileRecordData> ListType;
+		typedef std::vector<SetListingItemType> ListType;
 		typedef std::vector<ShortFileRecordData> ShortListType;
 		typedef implem::DirIterator<true> const_iterator;
 		explicit SetListing ( ListType&& parList, bool parSort=true );
@ -152,7 +153,6 @@ namespace mchlib {
 		static void sort_list ( ListType& parList );
 		static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
 		static ShortListType::iterator lower_bound ( ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
 	private:
 		ListType m_list;
--- a/src/machinery/CMakeLists.txt
+++ b/src/machinery/CMakeLists.txt
@ -18,6 +18,7 @@ add_library(${PROJECT_NAME} SHARED
 	globbing.cpp
 	scantask/dirtree.cpp
 	scantask/mediatype.cpp
 	scantask/hashing.cpp
 )
 #target_include_directories(${PROJECT_NAME}
--- a/src/machinery/scantask/dirtree.cpp
+++ b/src/machinery/scantask/dirtree.cpp
@ -18,6 +18,7 @@
 #include "dindexer-machinery/scantask/dirtree.hpp"
 #include "dindexer-machinery/recorddata.hpp"
 #include "dindexer-machinery/set_listing.hpp"
 #include "helpers/compatibility.h"
 #include "filesearcher.hpp"
 #include "pathname.hpp"
 #include <utility>
@ -28,6 +29,17 @@
 namespace mchlib {
 	namespace {
 		std::size_t calc_rel_path_offs ( const PathName& parRoot, boost::string_ref parPath ) a_pure;
 		std::size_t calc_rel_path_offs (const PathName& parRoot, boost::string_ref parPath) {
 			PathName path(parPath);
 			PathName rel_path = make_relative_path(parRoot, path);
 			const auto rel_path_len = rel_path.str_path_size();
 			const auto path_len = path.str_path_size();
 			assert(rel_path_len <= path_len);
 			return path_len - rel_path_len;
 		}
 		bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) {
 			using boost::string_ref;
@ -48,15 +60,15 @@ namespace mchlib {
 			parOut.insert(
 				it_before,
-				ShortFileRecordData {
+				FileRecordData(
-					std::string(parPath),
+					parPath,
-					make_relative_path(parRoot, PathName(string_ref(parPath))).path(),
+					calc_rel_path_offs(parRoot, string_ref(parPath)),
 					parStats.atime,
 					parStats.mtime,
 					static_cast<uint16_t>(parStats.level),
 					static_cast<bool>(parStats.is_dir),
 					static_cast<bool>(parStats.is_symlink)
-				}
+				)
 			);
 			return true;
 		}
--- a/src/machinery/scantask/hashing.cpp
+++ b/src/machinery/scantask/hashing.cpp
@ -0,0 +1,124 @@
 /* Copyright 2015, 2016, Michele Santullo
 * This file is part of "dindexer".
 *
 * "dindexer" is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * "dindexer" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "dindexer-machinery/scantask/hashing.hpp"
 #include "dindexer-machinery/recorddata.hpp"
 #include "dindexer-machinery/set_listing.hpp"
 #include "pathname.hpp"
 #include <cassert>
 #include <boost/range/empty.hpp>
 #include <boost/utility/string_ref.hpp>
 namespace mchlib {
 	namespace {
 		//Appends the raw bytes of parHash, immediately followed by the
 		//characters of parString, at the end of parDest.
 		void append_to_vec (std::vector<char>& parDest, const TigerHash& parHash, boost::string_ref parString) {
 			const auto* const hash_first = parHash.byte_data;
 			const auto* const hash_last = hash_first + sizeof(TigerHash);
 			parDest.reserve(parDest.size() + sizeof(TigerHash) + parString.size());
 			parDest.insert(parDest.end(), hash_first, hash_last);
 			parDest.insert(parDest.end(), parString.begin(), parString.end());
 		}
 		//Appends the characters of parString at the end of parDest.
 		void append_to_vec (std::vector<char>& parDest, boost::string_ref parString) {
 			parDest.insert(parDest.end(), parString.begin(), parString.end());
 		}
 		//Hashes the directory parEntry together with everything below it.
 		//parEntry: the directory being hashed; its hash is written and its
 		//  hash_valid flag is set on return.
 		//parList: view over the direct children of parEntry. Subdirectories
 		//  are hashed recursively, files are hashed with tiger_file.
 		//parIgnoreErrors: when true, a file that raises
 		//  std::ios_base::failure is marked unreadable and gets an empty hash;
 		//  when false the exception is propagated to the caller unchanged.
 		void hash_dir (FileRecordData& parEntry, MutableSetListingView& parList, bool parIgnoreErrors) {
 			assert(parEntry.is_directory);
 			//Build a blob out of the direct children of the current entry:
 			//subdirectories contribute their (recursively computed) hash
 			//followed by their path, files contribute their path only
 			std::vector<char> dir_blob;
 #if defined(INDEXER_VERBOSE)
 			std::cout << "Making initial hash for " << parEntry.abs_path << "...\n";
 #endif
 			for (auto it = parList.begin(); it != parList.end(); ++it) {
 				assert(PathName(parEntry.abs_path) == PathName(it->abs_path).pop_right());
 				if (it->is_directory) {
 					auto cd_list = MutableSetListingView(it);
 					assert(boost::empty(cd_list) or cd_list.begin()->abs_path != it->abs_path);
 					hash_dir(*it, cd_list, parIgnoreErrors);
 					append_to_vec(dir_blob, it->hash, it->path);
 				}
 				else {
 					append_to_vec(dir_blob, it->path);
 				}
 			}
 			tiger_data(dir_blob, parEntry.hash);
 #if defined(INDEXER_VERBOSE)
 			std::cout << "Got intermediate hash for dir " << parEntry.abs_path <<
 				": " << tiger_to_string(parEntry.hash) <<
 				' ' << parEntry.mime_type << '\n';
 #endif
 			//Now with the initial hash ready, let's start hashing files, if any
 			//NOTE(review): parEntry.hash is handed to tiger_file for every file,
 			//presumably so that each file is folded into the directory hash -
 			//confirm against tiger.hpp
 			for (auto it = first_file(parList); it != parList.end(); ++it) {
 				assert(not it->is_directory);
 #if defined(INDEXER_VERBOSE)
 				std::cout << "Hashing file " << it->abs_path << "...";
 #endif
 				//TODO: notify callback
 				try {
 					tiger_file(it->abs_path, it->hash, parEntry.hash, it->size);
 					it->hash_valid = true;
 				}
 				catch (const std::ios_base::failure&) {
 					if (not parIgnoreErrors) {
 						//Bare rethrow: keeps the dynamic type of the exception
 						//and does not copy it
 						throw;
 					}
 					//Best effort: flag the file and leave it with an empty hash
 					it->unreadable = true;
 					it->hash = TigerHash {};
 				}
 			}
 #if defined(INDEXER_VERBOSE)
 			std::cout << "Final hash for dir " << parEntry.abs_path << " is " << tiger_to_string(parEntry.hash) << '\n';
 #endif
 			parEntry.hash_valid = true;
 		}
 	} //unnamed namespace
 	namespace scantask {
 		Hashing::Hashing (std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors) :
 			m_file_tree_task(parFileTree),
 			m_ignore_errors(parIgnoreErrors)
 		{
 			assert(m_file_tree_task);
 		}
 		//Nothing to release by hand: members clean up after themselves.
 		Hashing::~Hashing() noexcept = default;
 		//The data of this task is the very list held by the wrapped
 		//file-tree task.
 		std::vector<FileRecordData>& Hashing::on_data_get() {
 			std::vector<FileRecordData>& records = m_file_tree_task->get_or_create();
 			return records;
 		}
 		void Hashing::on_data_fill() {
 			std::vector<FileRecordData>& file_list = m_file_tree_task->get_or_create();
 			MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
 			hash_dir(file_list.front(), recordlist, m_ignore_errors);
 		}
 	} //namespace scantask
 } //namespace mchlib
--- a/src/machinery/set_listing.cpp
+++ b/src/machinery/set_listing.cpp
@ -29,26 +29,26 @@ namespace mchlib {
 		//to be made.
 		struct FileRecordDataForSearch {
 			FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) :
-				abs_path(parPath),
+				path(parPath),
 				level(parLevel),
 				is_directory(parIsDir)
 			{
 				assert(parPath);
 			}
-			boost::string_ref abs_path;
+			boost::string_ref path;
 			uint16_t level;
 			bool is_directory;
 		};
-		template <typename RecordType, typename OtherRecord>
+		template <typename OtherRecord>
-		bool file_record_data_lt (const RecordType& parLeft, const OtherRecord& parRight) {
+		bool file_record_data_lt (const SetListingItemType& parLeft, const OtherRecord& parRight) {
-			const RecordType& l = parLeft;
+			const SetListingItemType& l = parLeft;
 			const OtherRecord& r = parRight;
 			return
 				(l.level < r.level)
 				or (l.level == r.level and l.is_directory and not r.is_directory)
-				or (l.level == r.level and l.is_directory == r.is_directory and l.abs_path < r.abs_path)
+				or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path)
 				//sort by directory - parent first, children later
 				//(level == o.level and is_dir and not o.is_dir)
@ -99,14 +99,14 @@ namespace mchlib {
 		{
 			assert(parBasePath);
 			assert(m_base_path or m_current == m_end);
-			assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->abs_path).atom_count());
+			assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->path).atom_count());
 			assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset);
 			//Look for the point where the children of this entry start
 			while (
 				m_current != m_end and (
 					m_current->level + m_level_offset == m_base_path->atom_count() or
-					*m_base_path != PathName(m_current->abs_path).pop_right()
+					*m_base_path != PathName(m_current->path).pop_right()
 			)) {
 				assert(m_base_path);
 				++m_current;
@ -157,13 +157,13 @@ namespace mchlib {
 		template <bool Const>
 		void DirIterator<Const>::increment() {
-			assert(PathName(m_current->abs_path).pop_right() == *m_base_path);
+			assert(PathName(m_current->path).pop_right() == *m_base_path);
 			do {
 				++m_current;
 			} while(
 				m_current != m_end and
 				m_current->level + m_level_offset == m_base_path->atom_count() + 1 and
-				*m_base_path != PathName(m_current->abs_path).pop_right()
+				*m_base_path != PathName(m_current->path).pop_right()
 			);
 		}
@ -222,7 +222,7 @@ namespace mchlib {
 			assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin()));
 		}
 		if (not m_list.empty()) {
-			m_base_path.reset(new PathName(m_list.front().abs_path));
+			m_base_path.reset(new PathName(m_list.front().path));
 		}
 	}
@ -258,7 +258,7 @@ namespace mchlib {
 		return std::count_if(
 			m_list.begin(),
 			m_list.end(),
-			[] (const FileRecordData& parItm) {
+			[] (const SetListingItemType& parItm) {
 				return not parItm.is_directory;
 			}
 		);
@ -268,7 +268,7 @@ namespace mchlib {
 		return std::count_if(
 			m_list.begin(),
 			m_list.end(),
-			[] (const FileRecordData& parItm) {
+			[] (const SetListingItemType& parItm) {
 				return parItm.is_directory;
 			}
 		);
@ -279,33 +279,27 @@ namespace mchlib {
 	}
 	void SetListing::sort_list (ListType& parList) {
-		std::sort(parList.begin(), parList.end(), &file_record_data_lt<FileRecordData, FileRecordData>);
+		std::sort(parList.begin(), parList.end(), &file_record_data_lt<SetListingItemType>);
 	}
 	SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
 		using boost::string_ref;
 		FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
-		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordData, FileRecordDataForSearch>);
+		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordDataForSearch>);
 	}
 	SetListing::ShortListType::iterator SetListing::lower_bound (ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
 		using boost::string_ref;
 		FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
 		return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<ShortFileRecordData, FileRecordDataForSearch>);
 	}
 	SetListingView<false> SetListing::make_view() {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<false>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}
 	SetListingView<true> SetListing::make_view() const {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}
 	SetListingView<true> SetListing::make_cview() const {
-		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
+		const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
 		return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
 	}
@ -317,7 +311,7 @@ namespace mchlib {
 		m_level_offset(parIter.m_level_offset)
 	{
 		if (m_begin != m_end) {
-			m_base_path.reset(new PathName(m_begin->abs_path));
+			m_base_path.reset(new PathName(m_begin->path));
 		}
 	}
@ -329,7 +323,7 @@ namespace mchlib {
 		m_level_offset(parLevelOffset)
 	{
 		if (m_begin != m_end) {
-			m_base_path.reset(new PathName(m_begin->abs_path));
+			m_base_path.reset(new PathName(m_begin->path));
 		}
 	}
--- a/src/scan/main.cpp
+++ b/src/scan/main.cpp
@ -28,6 +28,7 @@
 #include "dbbackend.hpp"
 #include "dindexer-machinery/scantask/dirtree.hpp"
 #include "dindexer-machinery/scantask/mediatype.hpp"
 #include "dindexer-machinery/scantask/hashing.hpp"
 #include <iostream>
 #include <iomanip>
 #include <ciso646>
@ -76,8 +77,16 @@ int main (int parArgc, char* parArgv[]) {
 	}
 	const std::string search_path(vm["search-path"].as<std::string>());
-	mchlib::scantask::DirTree scan_dirtree(search_path);
+	std::shared_ptr<mchlib::scantask::DirTree> scan_dirtree(new mchlib::scantask::DirTree(search_path));
-	mchlib::scantask::MediaType media_type(vm["type"].as<char>(), not vm.count("type"), search_path);
+	std::shared_ptr<mchlib::scantask::MediaType> media_type(new mchlib::scantask::MediaType((vm.count("type") ? vm["type"].as<char>() : 'O'), not vm.count("type"), search_path));
 	std::shared_ptr<mchlib::scantask::Hashing> hashing(new mchlib::scantask::Hashing(scan_dirtree, true));
 	const auto& hashes = hashing->get_or_create();
 	for (const auto& hash : hashes) {
 		std::cout << mchlib::tiger_to_string(hash.hash) << std::endl;
 	}
 	return 0;
 #if defined(WITH_MEDIA_AUTODETECT)
 	//char set_type;