1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2025-07-03 14:14:11 +00:00

Implement hashing task.

Also get rid of the ShortFileRecordData and put the new LeanBase
class to use.
This commit is contained in:
King_DuckZ 2016-03-08 08:48:12 +01:00
parent c64e572fc8
commit d2588d3c7e
9 changed files with 247 additions and 52 deletions

View file

@ -51,6 +51,24 @@ namespace mchlib {
{ {
} }
//Builds a record for a freshly scanned entry.
//parPath is moved into abs_path; path becomes a view over abs_path that
//starts parRelPathOffs characters in. The mime fields are left empty, hash is
//value-initialized, size is 0 and both unreadable and hash_valid start false.
//NOTE(review): path refers to the characters owned by abs_path, so it
//presumably needs re-pointing whenever the record is copied or moved -
//confirm how the copy/move operations of this struct handle that.
FileRecordData ( std::string&& parPath, std::size_t parRelPathOffs, std::time_t parATime, std::time_t parMTime, uint16_t parLevel, bool parIsDir, bool parIsSymLink ) :
hash {},
abs_path(std::move(parPath)),
mime_full(),
atime(parATime),
mtime(parMTime),
//abs_path is already initialized here, so the view is taken on the member
//and not on the (moved-from) parameter
path(boost::string_ref(abs_path).substr(parRelPathOffs)),
mime_type(),
mime_charset(),
size(0),
level(parLevel),
is_directory(parIsDir),
is_symlink(parIsSymLink),
unreadable(false),
hash_valid(false)
{
}
#if defined(NDEBUG) #if defined(NDEBUG)
FileRecordData ( const FileRecordData& ) = delete; FileRecordData ( const FileRecordData& ) = delete;
#else #else
@ -79,16 +97,6 @@ namespace mchlib {
bool hash_valid; bool hash_valid;
}; };
//Reduced per-entry record holding only paths, timestamps and entry kind.
struct ShortFileRecordData {
//Path of the entry as it was handed over by the scanner
std::string abs_path;
//Path of the entry relative to the root of the scan
std::string path;
//Access time of the entry
std::time_t atime;
//Modification time of the entry
std::time_t mtime;
//Depth level of the entry as reported by the scanner
uint16_t level;
//True when the entry is a directory
bool is_directory;
//True when the entry is a symbolic link
bool is_symlink;
};
struct SetRecordData { struct SetRecordData {
boost::string_ref name; boost::string_ref name;
char type; char type;

View file

@ -23,12 +23,12 @@
#include <vector> #include <vector>
namespace mchlib { namespace mchlib {
struct ShortFileRecordData; struct FileRecordData;
namespace scantask { namespace scantask {
class DirTree : public Base<std::vector<ShortFileRecordData>> { class DirTree : public Base<std::vector<FileRecordData>> {
public: public:
typedef std::vector<ShortFileRecordData> PathList; typedef std::vector<FileRecordData> PathList;
explicit DirTree ( std::string parRoot ); explicit DirTree ( std::string parRoot );
virtual ~DirTree ( void ) noexcept = default; virtual ~DirTree ( void ) noexcept = default;

View file

@ -0,0 +1,47 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idC7CC55298AC049EAA80604D6C7FD081D
#define idC7CC55298AC049EAA80604D6C7FD081D
#include "dindexer-machinery/scantask/leanbase.hpp"
#include "dindexer-machinery/tiger.hpp"
#include <vector>
#include <memory>
namespace mchlib {
struct FileRecordData;
namespace scantask {
//Scan task that fills in the hashes of the records produced by another
//task. It does not own a list of its own: the data it exposes is the list
//held by the file tree task it was constructed with.
class Hashing : public LeanBase<std::vector<FileRecordData>> {
public:
	//Type of the task that provides the list of records to hash
	using FileTreeBase = LeanBase<std::vector<FileRecordData>>;

	//parFileTree must not be null. parIgnoreErrors selects whether files
	//that fail to be read are flagged as unreadable or abort the hashing
	//with an exception.
	Hashing ( std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors );
	virtual ~Hashing ( void ) noexcept;

private:
	//Computes the hashes on the list provided by the file tree task
	void on_data_fill ( void ) override;
	//Returns the list held by the file tree task
	std::vector<FileRecordData>& on_data_get ( void ) override;

	std::shared_ptr<FileTreeBase> m_file_tree_task;
	bool m_ignore_errors;
};
} //namespace scantask
} //namespace mchlib
#endif

View file

@ -40,19 +40,21 @@ namespace mchlib {
template <bool Const> template <bool Const>
implem::DirIterator<Const> first_file ( SetListingView<Const>& parList ); implem::DirIterator<Const> first_file ( SetListingView<Const>& parList );
typedef FileRecordData SetListingItemType;
namespace implem { namespace implem {
template <bool Const> template <bool Const>
class DirIterator : public boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> { class DirIterator : public boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> {
friend class mchlib::SetListingView<Const>; friend class mchlib::SetListingView<Const>;
friend class boost::iterator_core_access; friend class boost::iterator_core_access;
template <bool> friend class DirIterator; template <bool> friend class DirIterator;
typedef boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> base_class; typedef boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> base_class;
struct enabler {}; struct enabler {};
public: public:
typedef typename std::conditional< typedef typename std::conditional<
Const, Const,
std::vector<mchlib::FileRecordData>::const_iterator, std::vector<SetListingItemType>::const_iterator,
std::vector<mchlib::FileRecordData>::iterator std::vector<SetListingItemType>::iterator
>::type VecIterator; >::type VecIterator;
typedef typename base_class::difference_type difference_type; typedef typename base_class::difference_type difference_type;
typedef typename base_class::value_type value_type; typedef typename base_class::value_type value_type;
@ -127,8 +129,7 @@ namespace mchlib {
class SetListing { class SetListing {
public: public:
typedef std::vector<FileRecordData> ListType; typedef std::vector<SetListingItemType> ListType;
typedef std::vector<ShortFileRecordData> ShortListType;
typedef implem::DirIterator<true> const_iterator; typedef implem::DirIterator<true> const_iterator;
explicit SetListing ( ListType&& parList, bool parSort=true ); explicit SetListing ( ListType&& parList, bool parSort=true );
@ -152,7 +153,6 @@ namespace mchlib {
static void sort_list ( ListType& parList ); static void sort_list ( ListType& parList );
static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir ); static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
static ShortListType::iterator lower_bound ( ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
private: private:
ListType m_list; ListType m_list;

View file

@ -18,6 +18,7 @@ add_library(${PROJECT_NAME} SHARED
globbing.cpp globbing.cpp
scantask/dirtree.cpp scantask/dirtree.cpp
scantask/mediatype.cpp scantask/mediatype.cpp
scantask/hashing.cpp
) )
#target_include_directories(${PROJECT_NAME} #target_include_directories(${PROJECT_NAME}

View file

@ -18,6 +18,7 @@
#include "dindexer-machinery/scantask/dirtree.hpp" #include "dindexer-machinery/scantask/dirtree.hpp"
#include "dindexer-machinery/recorddata.hpp" #include "dindexer-machinery/recorddata.hpp"
#include "dindexer-machinery/set_listing.hpp" #include "dindexer-machinery/set_listing.hpp"
#include "helpers/compatibility.h"
#include "filesearcher.hpp" #include "filesearcher.hpp"
#include "pathname.hpp" #include "pathname.hpp"
#include <utility> #include <utility>
@ -28,6 +29,17 @@
namespace mchlib { namespace mchlib {
namespace { namespace {
std::size_t calc_rel_path_offs ( const PathName& parRoot, boost::string_ref parPath ) a_pure;
std::size_t calc_rel_path_offs (const PathName& parRoot, boost::string_ref parPath) {
PathName path(parPath);
PathName rel_path = make_relative_path(parRoot, path);
const auto rel_path_len = rel_path.str_path_size();
const auto path_len = path.str_path_size();
assert(rel_path_len <= path_len);
return path_len - rel_path_len;
}
bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) { bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) {
using boost::string_ref; using boost::string_ref;
@ -48,15 +60,15 @@ namespace mchlib {
parOut.insert( parOut.insert(
it_before, it_before,
ShortFileRecordData { FileRecordData(
std::string(parPath), parPath,
make_relative_path(parRoot, PathName(string_ref(parPath))).path(), calc_rel_path_offs(parRoot, string_ref(parPath)),
parStats.atime, parStats.atime,
parStats.mtime, parStats.mtime,
static_cast<uint16_t>(parStats.level), static_cast<uint16_t>(parStats.level),
static_cast<bool>(parStats.is_dir), static_cast<bool>(parStats.is_dir),
static_cast<bool>(parStats.is_symlink) static_cast<bool>(parStats.is_symlink)
} )
); );
return true; return true;
} }

View file

@ -0,0 +1,124 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#include "dindexer-machinery/scantask/hashing.hpp"
#include "dindexer-machinery/recorddata.hpp"
#include "dindexer-machinery/set_listing.hpp"
#include "pathname.hpp"
#include <cassert>
#include <boost/range/empty.hpp>
#include <boost/utility/string_ref.hpp>
namespace mchlib {
namespace {
//Appends the raw bytes of parHash, immediately followed by the characters
//of parString, to the end of parDest.
void append_to_vec (std::vector<char>& parDest, const TigerHash& parHash, boost::string_ref parString) {
	const auto* const hash_begin = parHash.byte_data;
	const auto* const hash_end = hash_begin + sizeof(TigerHash);

	parDest.insert(parDest.end(), hash_begin, hash_end);
	parDest.insert(parDest.end(), parString.begin(), parString.end());
}
//Appends the characters of parString to the end of parDest.
void append_to_vec (std::vector<char>& parDest, boost::string_ref parString) {
	parDest.insert(parDest.end(), parString.begin(), parString.end());
}
//Computes the hash of the directory parEntry and of everything in parList,
//which is expected to list the direct children of parEntry.
//Subdirectories are hashed first, recursively; their hashes and names, plus
//the names of the files, make up the blob the directory hash is computed
//from. Files are hashed afterwards, with the directory hash handed over to
//tiger_file().
//When a file can't be read and parIgnoreErrors is true, the file is flagged
//as unreadable and its hash is reset; when parIgnoreErrors is false the
//std::ios_base::failure exception is propagated to the caller unchanged.
void hash_dir (FileRecordData& parEntry, MutableSetListingView& parList, bool parIgnoreErrors) {
	assert(parEntry.is_directory);

	//Build a blob with the hashes and names of every directory that is a
	//direct child of current entry, and the names of every file
	std::vector<char> dir_blob;
#if defined(INDEXER_VERBOSE)
	std::cout << "Making initial hash for " << parEntry.abs_path << "...\n";
#endif
	for (auto it = parList.begin(); it != parList.end(); ++it) {
		assert(PathName(parEntry.abs_path) == PathName(it->abs_path).pop_right());
		if (it->is_directory) {
			auto cd_list = MutableSetListingView(it);
			assert(boost::empty(cd_list) or cd_list.begin()->abs_path != it->abs_path);

			//The subdirectory must be fully hashed before its hash can
			//contribute to the blob of the current directory
			hash_dir(*it, cd_list, parIgnoreErrors);
			append_to_vec(dir_blob, it->hash, it->path);
		}
		else {
			append_to_vec(dir_blob, it->path);
		}
	}
	tiger_data(dir_blob, parEntry.hash);

#if defined(INDEXER_VERBOSE)
	std::cout << "Got intermediate hash for dir " << parEntry.abs_path <<
		": " << tiger_to_string(parEntry.hash) <<
		' ' << parEntry.mime_type << '\n';
#endif

	//Now with the initial hash ready, let's start hashing files, if any
	for (auto it = first_file(parList); it != parList.end(); ++it) {
		assert(not it->is_directory);
#if defined(INDEXER_VERBOSE)
		std::cout << "Hashing file " << it->abs_path << "...";
#endif
		//TODO: notify callback
		try {
			//NOTE(review): parEntry.hash is presumably updated by
			//tiger_file() as each file gets hashed - confirm
			tiger_file(it->abs_path, it->hash, parEntry.hash, it->size);
			it->hash_valid = true;
		}
		catch (const std::ios_base::failure&) {
			if (parIgnoreErrors) {
				it->unreadable = true;
				it->hash = TigerHash {};
			}
			else {
				//Rethrow the original exception object: "throw e" would
				//throw a copy, losing any more derived type
				throw;
			}
		}
	}

#if defined(INDEXER_VERBOSE)
	std::cout << "Final hash for dir " << parEntry.abs_path << " is " << tiger_to_string(parEntry.hash) << '\n';
#endif
	parEntry.hash_valid = true;
}
} //unnamed namespace
namespace scantask {
Hashing::Hashing (std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors) :
m_file_tree_task(parFileTree),
m_ignore_errors(parIgnoreErrors)
{
assert(m_file_tree_task);
}
//Nothing to release explicitly: members clean up after themselves
Hashing::~Hashing() noexcept = default;
//The data of this task is the list held by the file tree task, so just
//forward the request to it.
std::vector<FileRecordData>& Hashing::on_data_get() {
	std::vector<FileRecordData>& tree_records = m_file_tree_task->get_or_create();
	return tree_records;
}
void Hashing::on_data_fill() {
std::vector<FileRecordData>& file_list = m_file_tree_task->get_or_create();
MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
hash_dir(file_list.front(), recordlist, m_ignore_errors);
}
} //namespace scantask
} //namespace mchlib

View file

@ -29,26 +29,26 @@ namespace mchlib {
//to be made. //to be made.
struct FileRecordDataForSearch { struct FileRecordDataForSearch {
FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) : FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) :
abs_path(parPath), path(parPath),
level(parLevel), level(parLevel),
is_directory(parIsDir) is_directory(parIsDir)
{ {
assert(parPath); assert(parPath);
} }
boost::string_ref abs_path; boost::string_ref path;
uint16_t level; uint16_t level;
bool is_directory; bool is_directory;
}; };
template <typename RecordType, typename OtherRecord> template <typename OtherRecord>
bool file_record_data_lt (const RecordType& parLeft, const OtherRecord& parRight) { bool file_record_data_lt (const SetListingItemType& parLeft, const OtherRecord& parRight) {
const RecordType& l = parLeft; const SetListingItemType& l = parLeft;
const OtherRecord& r = parRight; const OtherRecord& r = parRight;
return return
(l.level < r.level) (l.level < r.level)
or (l.level == r.level and l.is_directory and not r.is_directory) or (l.level == r.level and l.is_directory and not r.is_directory)
or (l.level == r.level and l.is_directory == r.is_directory and l.abs_path < r.abs_path) or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path)
//sort by directory - parent first, children later //sort by directory - parent first, children later
//(level == o.level and is_dir and not o.is_dir) //(level == o.level and is_dir and not o.is_dir)
@ -99,14 +99,14 @@ namespace mchlib {
{ {
assert(parBasePath); assert(parBasePath);
assert(m_base_path or m_current == m_end); assert(m_base_path or m_current == m_end);
assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->abs_path).atom_count()); assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->path).atom_count());
assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset); assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset);
//Look for the point where the children of this entry start //Look for the point where the children of this entry start
while ( while (
m_current != m_end and ( m_current != m_end and (
m_current->level + m_level_offset == m_base_path->atom_count() or m_current->level + m_level_offset == m_base_path->atom_count() or
*m_base_path != PathName(m_current->abs_path).pop_right() *m_base_path != PathName(m_current->path).pop_right()
)) { )) {
assert(m_base_path); assert(m_base_path);
++m_current; ++m_current;
@ -157,13 +157,13 @@ namespace mchlib {
template <bool Const> template <bool Const>
void DirIterator<Const>::increment() { void DirIterator<Const>::increment() {
assert(PathName(m_current->abs_path).pop_right() == *m_base_path); assert(PathName(m_current->path).pop_right() == *m_base_path);
do { do {
++m_current; ++m_current;
} while( } while(
m_current != m_end and m_current != m_end and
m_current->level + m_level_offset == m_base_path->atom_count() + 1 and m_current->level + m_level_offset == m_base_path->atom_count() + 1 and
*m_base_path != PathName(m_current->abs_path).pop_right() *m_base_path != PathName(m_current->path).pop_right()
); );
} }
@ -222,7 +222,7 @@ namespace mchlib {
assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin())); assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin()));
} }
if (not m_list.empty()) { if (not m_list.empty()) {
m_base_path.reset(new PathName(m_list.front().abs_path)); m_base_path.reset(new PathName(m_list.front().path));
} }
} }
@ -258,7 +258,7 @@ namespace mchlib {
return std::count_if( return std::count_if(
m_list.begin(), m_list.begin(),
m_list.end(), m_list.end(),
[] (const FileRecordData& parItm) { [] (const SetListingItemType& parItm) {
return not parItm.is_directory; return not parItm.is_directory;
} }
); );
@ -268,7 +268,7 @@ namespace mchlib {
return std::count_if( return std::count_if(
m_list.begin(), m_list.begin(),
m_list.end(), m_list.end(),
[] (const FileRecordData& parItm) { [] (const SetListingItemType& parItm) {
return parItm.is_directory; return parItm.is_directory;
} }
); );
@ -279,33 +279,27 @@ namespace mchlib {
} }
void SetListing::sort_list (ListType& parList) { void SetListing::sort_list (ListType& parList) {
std::sort(parList.begin(), parList.end(), &file_record_data_lt<FileRecordData, FileRecordData>); std::sort(parList.begin(), parList.end(), &file_record_data_lt<SetListingItemType>);
} }
SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) { SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
using boost::string_ref; using boost::string_ref;
FileRecordDataForSearch find_record(parPath, parLevel, parIsDir); FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordData, FileRecordDataForSearch>); return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordDataForSearch>);
}
SetListing::ShortListType::iterator SetListing::lower_bound (ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
using boost::string_ref;
FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<ShortFileRecordData, FileRecordDataForSearch>);
} }
SetListingView<false> SetListing::make_view() { SetListingView<false> SetListing::make_view() {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count()); const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<false>(m_list.begin(), m_list.end(), offs, m_base_path); return SetListingView<false>(m_list.begin(), m_list.end(), offs, m_base_path);
} }
SetListingView<true> SetListing::make_view() const { SetListingView<true> SetListing::make_view() const {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count()); const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path); return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
} }
SetListingView<true> SetListing::make_cview() const { SetListingView<true> SetListing::make_cview() const {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count()); const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path); return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
} }
@ -317,7 +311,7 @@ namespace mchlib {
m_level_offset(parIter.m_level_offset) m_level_offset(parIter.m_level_offset)
{ {
if (m_begin != m_end) { if (m_begin != m_end) {
m_base_path.reset(new PathName(m_begin->abs_path)); m_base_path.reset(new PathName(m_begin->path));
} }
} }
@ -329,7 +323,7 @@ namespace mchlib {
m_level_offset(parLevelOffset) m_level_offset(parLevelOffset)
{ {
if (m_begin != m_end) { if (m_begin != m_end) {
m_base_path.reset(new PathName(m_begin->abs_path)); m_base_path.reset(new PathName(m_begin->path));
} }
} }

View file

@ -28,6 +28,7 @@
#include "dbbackend.hpp" #include "dbbackend.hpp"
#include "dindexer-machinery/scantask/dirtree.hpp" #include "dindexer-machinery/scantask/dirtree.hpp"
#include "dindexer-machinery/scantask/mediatype.hpp" #include "dindexer-machinery/scantask/mediatype.hpp"
#include "dindexer-machinery/scantask/hashing.hpp"
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <ciso646> #include <ciso646>
@ -76,8 +77,16 @@ int main (int parArgc, char* parArgv[]) {
} }
const std::string search_path(vm["search-path"].as<std::string>()); const std::string search_path(vm["search-path"].as<std::string>());
mchlib::scantask::DirTree scan_dirtree(search_path); std::shared_ptr<mchlib::scantask::DirTree> scan_dirtree(new mchlib::scantask::DirTree(search_path));
mchlib::scantask::MediaType media_type(vm["type"].as<char>(), not vm.count("type"), search_path); std::shared_ptr<mchlib::scantask::MediaType> media_type(new mchlib::scantask::MediaType((vm.count("type") ? vm["type"].as<char>() : 'O'), not vm.count("type"), search_path));
std::shared_ptr<mchlib::scantask::Hashing> hashing(new mchlib::scantask::Hashing(scan_dirtree, true));
const auto& hashes = hashing->get_or_create();
for (const auto& hash : hashes) {
std::cout << mchlib::tiger_to_string(hash.hash) << std::endl;
}
return 0;
#if defined(WITH_MEDIA_AUTODETECT) #if defined(WITH_MEDIA_AUTODETECT)
//char set_type; //char set_type;