1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2024-11-29 01:33:46 +00:00

Implement hashing task.

Also get rid of the ShortFileRecordData and put the new LeanBase
class to use.
This commit is contained in:
King_DuckZ 2016-03-08 08:48:12 +01:00
parent c64e572fc8
commit d2588d3c7e
9 changed files with 247 additions and 52 deletions

View file

@ -51,6 +51,24 @@ namespace mchlib {
{
}
//Builds a record from the data gathered while scanning the filesystem.
//parPath is moved into abs_path; path is initialised from a string_ref
//over abs_path that skips the first parRelPathOffs characters.
//Timestamps, level and the directory/symlink flags are copied from the
//parameters; hash and the mime fields start out empty, size starts at 0
//and both unreadable and hash_valid start out false.
//NOTE(review): path is built from abs_path, so this relies on abs_path
//being declared (and therefore initialised) before path - the member
//declarations are not visible here, please confirm.
//NOTE(review): boost::string_ref::substr is expected to throw when
//parRelPathOffs is greater than the path length - confirm callers never
//pass such an offset.
FileRecordData ( std::string&& parPath, std::size_t parRelPathOffs, std::time_t parATime, std::time_t parMTime, uint16_t parLevel, bool parIsDir, bool parIsSymLink ) :
hash {},
abs_path(std::move(parPath)),
mime_full(),
atime(parATime),
mtime(parMTime),
//everything from parRelPathOffs onwards in abs_path
path(boost::string_ref(abs_path).substr(parRelPathOffs)),
mime_type(),
mime_charset(),
size(0),
level(parLevel),
is_directory(parIsDir),
is_symlink(parIsSymLink),
unreadable(false),
hash_valid(false)
{
}
#if defined(NDEBUG)
FileRecordData ( const FileRecordData& ) = delete;
#else
@ -79,16 +97,6 @@ namespace mchlib {
bool hash_valid;
};
struct ShortFileRecordData {
std::string abs_path;
std::string path;
std::time_t atime;
std::time_t mtime;
uint16_t level;
bool is_directory;
bool is_symlink;
};
struct SetRecordData {
boost::string_ref name;
char type;

View file

@ -23,12 +23,12 @@
#include <vector>
namespace mchlib {
struct ShortFileRecordData;
struct FileRecordData;
namespace scantask {
class DirTree : public Base<std::vector<ShortFileRecordData>> {
class DirTree : public Base<std::vector<FileRecordData>> {
public:
typedef std::vector<ShortFileRecordData> PathList;
typedef std::vector<FileRecordData> PathList;
explicit DirTree ( std::string parRoot );
virtual ~DirTree ( void ) noexcept = default;

View file

@ -0,0 +1,47 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idC7CC55298AC049EAA80604D6C7FD081D
#define idC7CC55298AC049EAA80604D6C7FD081D
#include "dindexer-machinery/scantask/leanbase.hpp"
#include "dindexer-machinery/tiger.hpp"
#include <vector>
#include <memory>
namespace mchlib {
struct FileRecordData;
namespace scantask {
//Scan task that takes the record list produced by another task and fills
//in the hash of every entry in it. The list itself is owned by the
//wrapped task; this class hands out a reference to that same list.
class Hashing : public LeanBase<std::vector<FileRecordData>> {
public:
	//Type of the task that provides the list of records to hash.
	using FileTreeBase = LeanBase<std::vector<FileRecordData>>;

	//parFileTree: task whose record list will be hashed
	//parIgnoreErrors: stored as-is and forwarded to the hashing code
	Hashing ( std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors );
	virtual ~Hashing ( void ) noexcept;

private:
	//LeanBase hooks
	void on_data_fill ( void ) override;
	std::vector<FileRecordData>& on_data_get ( void ) override;

	std::shared_ptr<FileTreeBase> m_file_tree_task;
	bool m_ignore_errors;
};
} //namespace scantask
} //namespace mchlib
#endif

View file

@ -40,19 +40,21 @@ namespace mchlib {
template <bool Const>
implem::DirIterator<Const> first_file ( SetListingView<Const>& parList );
typedef FileRecordData SetListingItemType;
namespace implem {
template <bool Const>
class DirIterator : public boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> {
class DirIterator : public boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> {
friend class mchlib::SetListingView<Const>;
friend class boost::iterator_core_access;
template <bool> friend class DirIterator;
typedef boost::iterator_facade<DirIterator<Const>, FileRecordData, boost::forward_traversal_tag> base_class;
typedef boost::iterator_facade<DirIterator<Const>, SetListingItemType, boost::forward_traversal_tag> base_class;
struct enabler {};
public:
typedef typename std::conditional<
Const,
std::vector<mchlib::FileRecordData>::const_iterator,
std::vector<mchlib::FileRecordData>::iterator
std::vector<SetListingItemType>::const_iterator,
std::vector<SetListingItemType>::iterator
>::type VecIterator;
typedef typename base_class::difference_type difference_type;
typedef typename base_class::value_type value_type;
@ -127,8 +129,7 @@ namespace mchlib {
class SetListing {
public:
typedef std::vector<FileRecordData> ListType;
typedef std::vector<ShortFileRecordData> ShortListType;
typedef std::vector<SetListingItemType> ListType;
typedef implem::DirIterator<true> const_iterator;
explicit SetListing ( ListType&& parList, bool parSort=true );
@ -152,7 +153,6 @@ namespace mchlib {
static void sort_list ( ListType& parList );
static ListType::iterator lower_bound ( ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
static ShortListType::iterator lower_bound ( ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir );
private:
ListType m_list;

View file

@ -18,6 +18,7 @@ add_library(${PROJECT_NAME} SHARED
globbing.cpp
scantask/dirtree.cpp
scantask/mediatype.cpp
scantask/hashing.cpp
)
#target_include_directories(${PROJECT_NAME}

View file

@ -18,6 +18,7 @@
#include "dindexer-machinery/scantask/dirtree.hpp"
#include "dindexer-machinery/recorddata.hpp"
#include "dindexer-machinery/set_listing.hpp"
#include "helpers/compatibility.h"
#include "filesearcher.hpp"
#include "pathname.hpp"
#include <utility>
@ -28,6 +29,17 @@
namespace mchlib {
namespace {
std::size_t calc_rel_path_offs ( const PathName& parRoot, boost::string_ref parPath ) a_pure;
std::size_t calc_rel_path_offs (const PathName& parRoot, boost::string_ref parPath) {
PathName path(parPath);
PathName rel_path = make_relative_path(parRoot, path);
const auto rel_path_len = rel_path.str_path_size();
const auto path_len = path.str_path_size();
assert(rel_path_len <= path_len);
return path_len - rel_path_len;
}
bool add_path (scantask::DirTree::PathList& parOut, const PathName& parRoot, const char* parPath, const fastf::FileStats& parStats) {
using boost::string_ref;
@ -48,15 +60,15 @@ namespace mchlib {
parOut.insert(
it_before,
ShortFileRecordData {
std::string(parPath),
make_relative_path(parRoot, PathName(string_ref(parPath))).path(),
FileRecordData(
parPath,
calc_rel_path_offs(parRoot, string_ref(parPath)),
parStats.atime,
parStats.mtime,
static_cast<uint16_t>(parStats.level),
static_cast<bool>(parStats.is_dir),
static_cast<bool>(parStats.is_symlink)
}
)
);
return true;
}

View file

@ -0,0 +1,124 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#include "dindexer-machinery/scantask/hashing.hpp"
#include "dindexer-machinery/recorddata.hpp"
#include "dindexer-machinery/set_listing.hpp"
#include "pathname.hpp"
#include <cassert>
#include <boost/range/empty.hpp>
#include <boost/utility/string_ref.hpp>
namespace mchlib {
namespace {
//Appends the raw bytes of parHash to parDest, immediately followed by the
//characters of parString. Existing content of parDest is left untouched.
void append_to_vec (std::vector<char>& parDest, const TigerHash& parHash, boost::string_ref parString) {
	const auto hash_first = parHash.byte_data;
	const auto hash_last = hash_first + sizeof(TigerHash);

	parDest.insert(parDest.end(), hash_first, hash_last);
	parDest.insert(parDest.end(), parString.begin(), parString.end());
}
//Appends the characters of parString at the end of parDest. Existing
//content of parDest is left untouched.
void append_to_vec (std::vector<char>& parDest, boost::string_ref parString) {
	parDest.insert(parDest.end(), parString.begin(), parString.end());
}
//Computes the hash of the directory described by parEntry, recursing into
//its subdirectories and hashing the files it directly contains.
//parEntry: record of the directory being hashed; is_directory must be set.
//          Its hash is written and hash_valid is set before returning.
//parList: view over the direct children of parEntry; their records are
//         updated in place.
//parIgnoreErrors: when true, a file for which tiger_file throws
//         std::ios_base::failure is flagged as unreadable and gets an
//         empty hash; when false the original exception is propagated.
void hash_dir (FileRecordData& parEntry, MutableSetListingView& parList, bool parIgnoreErrors) {
	assert(parEntry.is_directory);

	//Build a blob out of the direct children of the current entry:
	//subdirectories contribute their hash followed by their path, files
	//contribute their path only
	std::vector<char> dir_blob;
#if defined(INDEXER_VERBOSE)
	std::cout << "Making initial hash for " << parEntry.abs_path << "...\n";
#endif
	for (auto it = parList.begin(); it != parList.end(); ++it) {
		assert(PathName(parEntry.abs_path) == PathName(it->abs_path).pop_right());
		if (it->is_directory) {
			//Subdirectories must be fully hashed first, since their hash
			//goes into the blob
			auto cd_list = MutableSetListingView(it);
			assert(boost::empty(cd_list) or cd_list.begin()->abs_path != it->abs_path);

			hash_dir(*it, cd_list, parIgnoreErrors);
			append_to_vec(dir_blob, it->hash, it->path);
		}
		else {
			append_to_vec(dir_blob, it->path);
		}
	}
	tiger_data(dir_blob, parEntry.hash);

#if defined(INDEXER_VERBOSE)
	std::cout << "Got intermediate hash for dir " << parEntry.abs_path <<
		": " << tiger_to_string(parEntry.hash) <<
		' ' << parEntry.mime_type << '\n';
#endif

	//Now with the initial hash ready, let's start hashing files, if any
	//NOTE(review): tiger_file is handed the directory hash as well;
	//presumably it folds each file into it - confirm against tiger.hpp
	for (auto it = first_file(parList); it != parList.end(); ++it) {
		assert(not it->is_directory);
#if defined(INDEXER_VERBOSE)
		std::cout << "Hashing file " << it->abs_path << "...";
#endif
		//TODO: notify callback
		try {
			tiger_file(it->abs_path, it->hash, parEntry.hash, it->size);
			it->hash_valid = true;
		}
		catch (const std::ios_base::failure&) {
			if (parIgnoreErrors) {
				it->unreadable = true;
				it->hash = TigerHash {};
			}
			else {
				//Plain rethrow: keeps the dynamic type of the exception
				//instead of throwing a sliced copy of it
				throw;
			}
		}
	}

#if defined(INDEXER_VERBOSE)
	std::cout << "Final hash for dir " << parEntry.abs_path << " is " << tiger_to_string(parEntry.hash) << '\n';
#endif
	parEntry.hash_valid = true;
}
} //unnamed namespace
namespace scantask {
Hashing::Hashing (std::shared_ptr<FileTreeBase> parFileTree, bool parIgnoreErrors) :
m_file_tree_task(parFileTree),
m_ignore_errors(parIgnoreErrors)
{
assert(m_file_tree_task);
}
//Nothing to release by hand, members clean up after themselves.
Hashing::~Hashing() noexcept = default;
//Hands out the record list of the wrapped file tree task; this class
//keeps no list of its own.
std::vector<FileRecordData>& Hashing::on_data_get() {
	std::vector<FileRecordData>& records = m_file_tree_task->get_or_create();
	return records;
}
void Hashing::on_data_fill() {
std::vector<FileRecordData>& file_list = m_file_tree_task->get_or_create();
MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
hash_dir(file_list.front(), recordlist, m_ignore_errors);
}
} //namespace scantask
} //namespace mchlib

View file

@ -29,26 +29,26 @@ namespace mchlib {
//to be made.
struct FileRecordDataForSearch {
FileRecordDataForSearch ( const char* parPath, uint16_t parLevel, bool parIsDir) :
abs_path(parPath),
path(parPath),
level(parLevel),
is_directory(parIsDir)
{
assert(parPath);
}
boost::string_ref abs_path;
boost::string_ref path;
uint16_t level;
bool is_directory;
};
template <typename RecordType, typename OtherRecord>
bool file_record_data_lt (const RecordType& parLeft, const OtherRecord& parRight) {
const RecordType& l = parLeft;
template <typename OtherRecord>
bool file_record_data_lt (const SetListingItemType& parLeft, const OtherRecord& parRight) {
const SetListingItemType& l = parLeft;
const OtherRecord& r = parRight;
return
(l.level < r.level)
or (l.level == r.level and l.is_directory and not r.is_directory)
or (l.level == r.level and l.is_directory == r.is_directory and l.abs_path < r.abs_path)
or (l.level == r.level and l.is_directory == r.is_directory and l.path < r.path)
//sort by directory - parent first, children later
//(level == o.level and is_dir and not o.is_dir)
@ -99,14 +99,14 @@ namespace mchlib {
{
assert(parBasePath);
assert(m_base_path or m_current == m_end);
assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->abs_path).atom_count());
assert(m_current == m_end or m_base_path->atom_count() == PathName(m_current->path).atom_count());
assert(m_current == m_end or m_base_path->atom_count() == m_current->level + m_level_offset);
//Look for the point where the children of this entry start
while (
m_current != m_end and (
m_current->level + m_level_offset == m_base_path->atom_count() or
*m_base_path != PathName(m_current->abs_path).pop_right()
*m_base_path != PathName(m_current->path).pop_right()
)) {
assert(m_base_path);
++m_current;
@ -157,13 +157,13 @@ namespace mchlib {
template <bool Const>
void DirIterator<Const>::increment() {
assert(PathName(m_current->abs_path).pop_right() == *m_base_path);
assert(PathName(m_current->path).pop_right() == *m_base_path);
do {
++m_current;
} while(
m_current != m_end and
m_current->level + m_level_offset == m_base_path->atom_count() + 1 and
*m_base_path != PathName(m_current->abs_path).pop_right()
*m_base_path != PathName(m_current->path).pop_right()
);
}
@ -222,7 +222,7 @@ namespace mchlib {
assert(std::equal(m_list.begin(), m_list.end(), SetListing(ListType(m_list), true).sorted_list().begin()));
}
if (not m_list.empty()) {
m_base_path.reset(new PathName(m_list.front().abs_path));
m_base_path.reset(new PathName(m_list.front().path));
}
}
@ -258,7 +258,7 @@ namespace mchlib {
return std::count_if(
m_list.begin(),
m_list.end(),
[] (const FileRecordData& parItm) {
[] (const SetListingItemType& parItm) {
return not parItm.is_directory;
}
);
@ -268,7 +268,7 @@ namespace mchlib {
return std::count_if(
m_list.begin(),
m_list.end(),
[] (const FileRecordData& parItm) {
[] (const SetListingItemType& parItm) {
return parItm.is_directory;
}
);
@ -279,33 +279,27 @@ namespace mchlib {
}
void SetListing::sort_list (ListType& parList) {
std::sort(parList.begin(), parList.end(), &file_record_data_lt<FileRecordData, FileRecordData>);
std::sort(parList.begin(), parList.end(), &file_record_data_lt<SetListingItemType>);
}
SetListing::ListType::iterator SetListing::lower_bound (ListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
using boost::string_ref;
FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordData, FileRecordDataForSearch>);
}
SetListing::ShortListType::iterator SetListing::lower_bound (ShortListType& parList, const char* parPath, uint16_t parLevel, bool parIsDir) {
using boost::string_ref;
FileRecordDataForSearch find_record(parPath, parLevel, parIsDir);
return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<ShortFileRecordData, FileRecordDataForSearch>);
return std::lower_bound(parList.begin(), parList.end(), find_record, &file_record_data_lt<FileRecordDataForSearch>);
}
SetListingView<false> SetListing::make_view() {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<false>(m_list.begin(), m_list.end(), offs, m_base_path);
}
SetListingView<true> SetListing::make_view() const {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
}
SetListingView<true> SetListing::make_cview() const {
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().abs_path).atom_count());
const auto offs = (m_list.empty() ? 0 : PathName(m_list.front().path).atom_count());
return SetListingView<true>(m_list.begin(), m_list.end(), offs, m_base_path);
}
@ -317,7 +311,7 @@ namespace mchlib {
m_level_offset(parIter.m_level_offset)
{
if (m_begin != m_end) {
m_base_path.reset(new PathName(m_begin->abs_path));
m_base_path.reset(new PathName(m_begin->path));
}
}
@ -329,7 +323,7 @@ namespace mchlib {
m_level_offset(parLevelOffset)
{
if (m_begin != m_end) {
m_base_path.reset(new PathName(m_begin->abs_path));
m_base_path.reset(new PathName(m_begin->path));
}
}

View file

@ -28,6 +28,7 @@
#include "dbbackend.hpp"
#include "dindexer-machinery/scantask/dirtree.hpp"
#include "dindexer-machinery/scantask/mediatype.hpp"
#include "dindexer-machinery/scantask/hashing.hpp"
#include <iostream>
#include <iomanip>
#include <ciso646>
@ -76,8 +77,16 @@ int main (int parArgc, char* parArgv[]) {
}
const std::string search_path(vm["search-path"].as<std::string>());
mchlib::scantask::DirTree scan_dirtree(search_path);
mchlib::scantask::MediaType media_type(vm["type"].as<char>(), not vm.count("type"), search_path);
std::shared_ptr<mchlib::scantask::DirTree> scan_dirtree(new mchlib::scantask::DirTree(search_path));
std::shared_ptr<mchlib::scantask::MediaType> media_type(new mchlib::scantask::MediaType((vm.count("type") ? vm["type"].as<char>() : 'O'), not vm.count("type"), search_path));
std::shared_ptr<mchlib::scantask::Hashing> hashing(new mchlib::scantask::Hashing(scan_dirtree, true));
const auto& hashes = hashing->get_or_create();
for (const auto& hash : hashes) {
std::cout << mchlib::tiger_to_string(hash.hash) << std::endl;
}
return 0;
#if defined(WITH_MEDIA_AUTODETECT)
//char set_type;