diff --git a/include/dindexer-machinery/recorddata.hpp b/include/dindexer-machinery/recorddata.hpp index 8b81ac1..59b1845 100644 --- a/include/dindexer-machinery/recorddata.hpp +++ b/include/dindexer-machinery/recorddata.hpp @@ -51,6 +51,24 @@ namespace mchlib { { } + FileRecordData ( std::string&& parPath, std::size_t parRelPathOffs, std::time_t parATime, std::time_t parMTime, uint16_t parLevel, bool parIsDir, bool parIsSymLink ) : + hash {}, + abs_path(std::move(parPath)), + mime_full(), + atime(parATime), + mtime(parMTime), + path(boost::string_ref(abs_path).substr(parRelPathOffs)), + mime_type(), + mime_charset(), + size(0), + level(parLevel), + is_directory(parIsDir), + is_symlink(parIsSymLink), + unreadable(false), + hash_valid(false) + { + } + #if defined(NDEBUG) FileRecordData ( const FileRecordData& ) = delete; #else @@ -79,16 +97,6 @@ namespace mchlib { bool hash_valid; }; - struct ShortFileRecordData { - std::string abs_path; - std::string path; - std::time_t atime; - std::time_t mtime; - uint16_t level; - bool is_directory; - bool is_symlink; - }; - struct SetRecordData { boost::string_ref name; char type; diff --git a/include/dindexer-machinery/scantask/dirtree.hpp b/include/dindexer-machinery/scantask/dirtree.hpp index 8e52dd3..d31b2e7 100644 --- a/include/dindexer-machinery/scantask/dirtree.hpp +++ b/include/dindexer-machinery/scantask/dirtree.hpp @@ -23,12 +23,12 @@ #include namespace mchlib { - struct ShortFileRecordData; + struct FileRecordData; namespace scantask { - class DirTree : public Base> { + class DirTree : public Base> { public: - typedef std::vector PathList; + typedef std::vector PathList; explicit DirTree ( std::string parRoot ); virtual ~DirTree ( void ) noexcept = default; diff --git a/include/dindexer-machinery/scantask/hashing.hpp b/include/dindexer-machinery/scantask/hashing.hpp new file mode 100644 index 0000000..fe1e9b4 --- /dev/null +++ b/include/dindexer-machinery/scantask/hashing.hpp @@ -0,0 +1,47 @@ +/* Copyright 2015, 
2016, Michele Santullo + * This file is part of "dindexer". + * + * "dindexer" is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * "dindexer" is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with "dindexer". If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef idC7CC55298AC049EAA80604D6C7FD081D +#define idC7CC55298AC049EAA80604D6C7FD081D + +#include "dindexer-machinery/scantask/leanbase.hpp" +#include "dindexer-machinery/tiger.hpp" +#include +#include + +namespace mchlib { + struct FileRecordData; + + namespace scantask { + class Hashing : public LeanBase> { + public: + typedef LeanBase> FileTreeBase; + + Hashing ( std::shared_ptr parFileTree, bool parIgnoreErrors ); + virtual ~Hashing ( void ) noexcept; + + private: + virtual void on_data_fill ( void ) override; + virtual std::vector& on_data_get ( void ) override; + + std::shared_ptr m_file_tree_task; + bool m_ignore_errors; + }; + } //namespace scantask +} //namespace mchlib + +#endif diff --git a/include/dindexer-machinery/set_listing.hpp b/include/dindexer-machinery/set_listing.hpp index c97560e..80991fc 100644 --- a/include/dindexer-machinery/set_listing.hpp +++ b/include/dindexer-machinery/set_listing.hpp @@ -40,19 +40,21 @@ namespace mchlib { template implem::DirIterator first_file ( SetListingView& parList ); + typedef FileRecordData SetListingItemType; + namespace implem { template - class DirIterator : public boost::iterator_facade, FileRecordData, boost::forward_traversal_tag> { + class DirIterator : public boost::iterator_facade, SetListingItemType, 
boost::forward_traversal_tag> { friend class mchlib::SetListingView; friend class boost::iterator_core_access; template friend class DirIterator; - typedef boost::iterator_facade, FileRecordData, boost::forward_traversal_tag> base_class; + typedef boost::iterator_facade, SetListingItemType, boost::forward_traversal_tag> base_class; struct enabler {}; public: typedef typename std::conditional< Const, - std::vector::const_iterator, - std::vector::iterator + std::vector::const_iterator, + std::vector::iterator >::type VecIterator; typedef typename base_class::difference_type difference_type; typedef typename base_class::value_type value_type; @@ -127,8 +129,7 @@ namespace mchlib { class SetListing { public: - typedef std::vector ListType; - typedef std::vector ShortListType; + typedef std::vector ListType; typedef implem::DirIterator const_iterator; explicit SetListing ( ListType&& parList, bool parSort=true ); @@ -152,7 +153,6 @@ namespace mchlib { static void sort_list ( ListType& parList ); static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir ); - static ShortListType::iterator lower_bound ( ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir ); private: ListType m_list; diff --git a/src/machinery/CMakeLists.txt b/src/machinery/CMakeLists.txt index 7482548..052d42a 100644 --- a/src/machinery/CMakeLists.txt +++ b/src/machinery/CMakeLists.txt @@ -18,6 +18,7 @@ add_library(${PROJECT_NAME} SHARED globbing.cpp scantask/dirtree.cpp scantask/mediatype.cpp + scantask/hashing.cpp ) #target_include_directories(${PROJECT_NAME} diff --git a/src/machinery/scantask/dirtree.cpp b/src/machinery/scantask/dirtree.cpp index 592216e..4cd3764 100644 --- a/src/machinery/scantask/dirtree.cpp +++ b/src/machinery/scantask/dirtree.cpp @@ -18,6 +18,7 @@ #include "dindexer-machinery/scantask/dirtree.hpp" #include "dindexer-machinery/recorddata.hpp" #include "dindexer-machinery/set_listing.hpp" +#include 
"helpers/compatibility.h" #include "filesearcher.hpp" #include "pathname.hpp" #include @@ -28,6 +29,17 @@ namespace mchlib { namespace { + std::size_t calc_rel_path_offs ( const PathName& parRoot, boost::string_ref parPath ) a_pure; + + std::size_t calc_rel_path_offs (const PathName& parRoot, boost::string_ref parPath) { + PathName path(parPath); + PathName rel_path = make_relative_path(parRoot, path); + const auto rel_path_len = rel_path.str_path_size(); + const auto path_len = path.str_path_size(); + assert(rel_path_len <= path_len); + return path_len - rel_path_len; + } + bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) { using boost::string_ref; @@ -48,15 +60,15 @@ namespace mchlib { parOut.insert( it_before, - ShortFileRecordData { - std::string(parPath), - make_relative_path(parRoot, PathName(string_ref(parPath))).path(), + FileRecordData( + parPath, + calc_rel_path_offs(parRoot, string_ref(parPath)), parStats.atime, parStats.mtime, static_cast(parStats.level), static_cast(parStats.is_dir), static_cast(parStats.is_symlink) - } + ) ); return true; } diff --git a/src/machinery/scantask/hashing.cpp b/src/machinery/scantask/hashing.cpp new file mode 100644 index 0000000..cbe449f --- /dev/null +++ b/src/machinery/scantask/hashing.cpp @@ -0,0 +1,124 @@ +/* Copyright 2015, 2016, Michele Santullo + * This file is part of "dindexer". + * + * "dindexer" is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * "dindexer" is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with "dindexer". If not, see <http://www.gnu.org/licenses/>. + */ + +#include "dindexer-machinery/scantask/hashing.hpp" +#include "dindexer-machinery/recorddata.hpp" +#include "dindexer-machinery/set_listing.hpp" +#include "pathname.hpp" +#include +#include +#include + +namespace mchlib { + namespace { + + void append_to_vec (std::vector& parDest, const TigerHash& parHash, boost::string_ref parString) { + const auto old_size = parDest.size(); + parDest.resize(old_size + sizeof(TigerHash) + parString.size()); + std::copy(parHash.byte_data, parHash.byte_data + sizeof(TigerHash), parDest.begin() + old_size); + std::copy(parString.begin(), parString.end(), parDest.begin() + old_size + sizeof(TigerHash)); + } + + void append_to_vec (std::vector& parDest, boost::string_ref parString) { + const auto old_size = parDest.size(); + parDest.resize(old_size + parString.size()); + std::copy(parString.begin(), parString.end(), parDest.begin() + old_size); + } + + void hash_dir (FileRecordData& parEntry, MutableSetListingView& parList, bool parIgnoreErrors) { + assert(parEntry.is_directory); + + //Build a blob with the hashes and filenames of every directory that + //is a direct child of current entry + std::vector dir_blob; +#if defined(INDEXER_VERBOSE) + std::cout << "Making initial hash for " << parCurrDir << "...\n"; +#endif + for (auto it = parList.begin(); it != parList.end(); ++it) { + assert(PathName(parEntry.abs_path) == PathName(it->abs_path).pop_right()); + + if (it->is_directory) { + auto cd_list = MutableSetListingView(it); + assert(boost::empty(cd_list) or cd_list.begin()->abs_path != it->abs_path); + + hash_dir(*it, cd_list, parIgnoreErrors); + append_to_vec(dir_blob, it->hash, it->path); + } + else { + append_to_vec(dir_blob, it->path); + } + } + tiger_data(dir_blob, parEntry.hash); + +#if defined(INDEXER_VERBOSE) + std::cout << "Got intermediate hash for dir " << parCurrDir << + ": " << 
tiger_to_string(parEntry.hash) << + ' ' << parEntry.mime_type << '\n'; +#endif + + //Now with the initial hash ready, let's start hashing files, if any + for (auto it = first_file(parList); it != parList.end(); ++it) { + assert(not it->is_directory); +#if defined(INDEXER_VERBOSE) + std::cout << "Hashing file " << it->abs_path << "..."; +#endif + //TODO: notify callback + try { + tiger_file(it->abs_path, it->hash, parEntry.hash, it->size); + it->hash_valid = true; + } + catch (const std::ios_base::failure& e) { + if (parIgnoreErrors) { + it->unreadable = true; + it->hash = TigerHash {}; + } + else { + throw e; + } + } + } + +#if defined(INDEXER_VERBOSE) + std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(parEntry.hash) << '\n'; +#endif + parEntry.hash_valid = true; + } + } //unnamed namespace + + namespace scantask { + Hashing::Hashing (std::shared_ptr parFileTree, bool parIgnoreErrors) : + m_file_tree_task(parFileTree), + m_ignore_errors(parIgnoreErrors) + { + assert(m_file_tree_task); + } + + Hashing::~Hashing() noexcept { + } + + std::vector& Hashing::on_data_get() { + return m_file_tree_task->get_or_create(); + } + + void Hashing::on_data_fill() { + std::vector& file_list = m_file_tree_task->get_or_create(); + + MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0); + hash_dir(file_list.front(), recordlist, m_ignore_errors); + } + } //namespace scantask +} //namespace mchlib diff --git a/src/machinery/set_listing.cpp b/src/machinery/set_listing.cpp index 5c3304e..e4f4c87 100644 --- a/src/machinery/set_listing.cpp +++ b/src/machinery/set_listing.cpp @@ -29,26 +29,26 @@ namespace mchlib { //to be made. 
struct FileRecordDataForSearch { FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) : - abs_path(parPath), + path(parPath), level(parLevel), is_directory(parIsDir) { assert(parPath); } - boost::string_ref abs_path; + boost::string_ref path; uint16_t level; bool is_directory; }; - template - bool file_record_data_lt (const RecordType& parLeft, const OtherRecord& parRight) { - const RecordType& l = parLeft; + template + bool file_record_data_lt (const SetListingItemType& parLeft, const OtherRecord& parRight) { + const SetListingItemType& l = parLeft; const OtherRecord& r = parRight; return (l.level < r.level) or (l.level == r.level and l.is_directory and not r.is_directory) - or (l.level == r.level and l.is_directory == r.is_directory and l.abs_path < r.abs_path) + or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path) //sort by directory - parent first, children later //(level == o.level and is_dir and not o.is_dir) @@ -99,14 +99,14 @@ namespace mchlib { { assert(parBasePath); assert(m_base_path or m_current == m_end); - assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->abs_path).atom_count()); + assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->path).atom_count()); assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset); //Look for the point where the children of this entry start while ( m_current != m_end and ( m_current->level + m_level_offset == m_base_path->atom_count() or - *m_base_path != PathName(m_current->abs_path).pop_right() + *m_base_path != PathName(m_current->path).pop_right() )) { assert(m_base_path); ++m_current; @@ -157,13 +157,13 @@ namespace mchlib { template void DirIterator::increment() { - assert(PathName(m_current->abs_path).pop_right() == *m_base_path); + assert(PathName(m_current->path).pop_right() == *m_base_path); do { ++m_current; } while( m_current != m_end and m_current->level + 
m_level_offset == m_base_path->atom_count() + 1 and - *m_base_path != PathName(m_current->abs_path).pop_right() + *m_base_path != PathName(m_current->path).pop_right() ); } @@ -222,7 +222,7 @@ namespace mchlib { assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin())); } if (not m_list.empty()) { - m_base_path.reset(new PathName(m_list.front().abs_path)); + m_base_path.reset(new PathName(m_list.front().path)); } } @@ -258,7 +258,7 @@ namespace mchlib { return std::count_if( m_list.begin(), m_list.end(), - [] (const FileRecordData& parItm) { + [] (const SetListingItemType& parItm) { return not parItm.is_directory; } ); @@ -268,7 +268,7 @@ namespace mchlib { return std::count_if( m_list.begin(), m_list.end(), - [] (const FileRecordData& parItm) { + [] (const SetListingItemType& parItm) { return parItm.is_directory; } ); @@ -279,33 +279,27 @@ namespace mchlib { } void SetListing::sort_list (ListType& parList) { - std::sort(parList.begin(), parList.end(), &file_record_data_lt); + std::sort(parList.begin(), parList.end(), &file_record_data_lt); } SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) { using boost::string_ref; FileRecordDataForSearch find_record(parPath, parLevel, parIsDir); - return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt); - } - - SetListing::ShortListType::iterator SetListing::lower_bound (ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) { - using boost::string_ref; - FileRecordDataForSearch find_record(parPath, parLevel, parIsDir); - return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt); + return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt); } SetListingView SetListing::make_view() { - const auto offs = (m_list.empty() ? 
0 : PathName(m_list.front().abs_path).atom_count()); + const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count()); return SetListingView(m_list.begin(), m_list.end(), offs, m_base_path); } SetListingView SetListing::make_view() const { - const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count()); + const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count()); return SetListingView(m_list.begin(), m_list.end(), offs, m_base_path); } SetListingView SetListing::make_cview() const { - const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count()); + const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count()); return SetListingView(m_list.begin(), m_list.end(), offs, m_base_path); } @@ -317,7 +311,7 @@ namespace mchlib { m_level_offset(parIter.m_level_offset) { if (m_begin != m_end) { - m_base_path.reset(new PathName(m_begin->abs_path)); + m_base_path.reset(new PathName(m_begin->path)); } } @@ -329,7 +323,7 @@ namespace mchlib { m_level_offset(parLevelOffset) { if (m_begin != m_end) { - m_base_path.reset(new PathName(m_begin->abs_path)); + m_base_path.reset(new PathName(m_begin->path)); } } diff --git a/src/scan/main.cpp b/src/scan/main.cpp index 777f322..3c4c33a 100644 --- a/src/scan/main.cpp +++ b/src/scan/main.cpp @@ -28,6 +28,7 @@ #include "dbbackend.hpp" #include "dindexer-machinery/scantask/dirtree.hpp" #include "dindexer-machinery/scantask/mediatype.hpp" +#include "dindexer-machinery/scantask/hashing.hpp" #include #include #include @@ -76,8 +77,16 @@ int main (int parArgc, char* parArgv[]) { } const std::string search_path(vm["search-path"].as()); - mchlib::scantask::DirTree scan_dirtree(search_path); - mchlib::scantask::MediaType media_type(vm["type"].as(), not vm.count("type"), search_path); + std::shared_ptr scan_dirtree(new mchlib::scantask::DirTree(search_path)); + std::shared_ptr media_type(new 
mchlib::scantask::MediaType((vm.count("type") ? vm["type"].as() : 'O'), not vm.count("type"), search_path)); + std::shared_ptr hashing(new mchlib::scantask::Hashing(scan_dirtree, true)); + + const auto& hashes = hashing->get_or_create(); + for (const auto& hash : hashes) { + std::cout << mchlib::tiger_to_string(hash.hash) << std::endl; + } + + return 0; #if defined(WITH_MEDIA_AUTODETECT) //char set_type;