From e95cd6cc44d15ab440821bbc9145c9812dee7bbd Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Sat, 12 Mar 2016 00:32:15 +0100 Subject: [PATCH] Add a --byhash switch to the locate command to search by content. This is mostly to demonstrate the new task classes. --- src/locate/CMakeLists.txt | 2 + src/locate/commandline.cpp | 1 + src/locate/hash.cpp | 98 ++++++++++++++++++++++++++++++ src/locate/hash.hpp | 28 +++++++++ src/locate/main.cpp | 11 +++- src/locate/postgre_locate.cpp | 36 +++++++---- src/locate/postgre_locate.hpp | 5 ++ src/machinery/scantask/hashing.cpp | 28 ++++++++- 8 files changed, 195 insertions(+), 14 deletions(-) create mode 100644 src/locate/hash.cpp create mode 100644 src/locate/hash.hpp diff --git a/src/locate/CMakeLists.txt b/src/locate/CMakeLists.txt index 77128dd..7e456e6 100644 --- a/src/locate/CMakeLists.txt +++ b/src/locate/CMakeLists.txt @@ -4,6 +4,7 @@ add_executable(${PROJECT_NAME} main.cpp commandline.cpp postgre_locate.cpp + hash.cpp ) target_include_directories(${PROJECT_NAME} @@ -13,6 +14,7 @@ target_include_directories(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME} PRIVATE ${bare_name}-if PRIVATE ${bare_name}-common + PRIVATE ${bare_name}-machinery ) string(REPLACE "${bare_name}-" "" ACTION_NAME "${PROJECT_NAME}") diff --git a/src/locate/commandline.cpp b/src/locate/commandline.cpp index b6d3abf..2f51977 100644 --- a/src/locate/commandline.cpp +++ b/src/locate/commandline.cpp @@ -28,6 +28,7 @@ namespace din { set_options.add_options() ("case-insensitive,i", "Disable case sensitivity during search") ("set,s", "Look for matching sets instead of files") + ("byhash,a", "Paths on the command line are local paths and searching should be done by content hash") //("option,o", po::value()->default_value("default_value"), "Help message") //("option2", po::value(), "Help message") ; diff --git a/src/locate/hash.cpp b/src/locate/hash.cpp new file mode 100644 index 0000000..ceb0bde --- /dev/null +++ b/src/locate/hash.cpp @@ -0,0 +1,98 @@ 
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer". If not, see .
+ */
+
+#include "hash.hpp"
+#include "dindexer-machinery/scantask/hashing.hpp"
+#include "dindexer-machinery/scantask/dirtree.hpp"
+#include "dindexer-machinery/recorddata.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace stask = mchlib::scantask;
+
+namespace din {
+	namespace {
+		class SingleFileTask : public stask::Base> { //source task exposing one file as a one-element PathList
+		public:
+			typedef std::vector PathList;
+
+			SingleFileTask ( std::string parPath, const struct stat* parStat );
+			virtual ~SingleFileTask ( void ) noexcept = default;
+
+		private:
+			virtual void on_data_destroy ( PathList& parData ) override;
+			virtual void on_data_create ( PathList& parData ) override;
+
+			std::string m_path;
+			const struct stat* m_stat; //non-owning; dereferenced later in on_data_create, so the pointee must still be alive then
+		};
+
+		SingleFileTask::SingleFileTask (std::string parPath, const struct stat* parStat) :
+			m_path(std::move(parPath)),
+			m_stat(parStat)
+		{
+			assert(not m_path.empty());
+			assert(m_stat);
+		}
+
+		void SingleFileTask::on_data_destroy (PathList& parData) {
+			assert(not parData.empty());
+			parData.clear();
+		}
+
+		void SingleFileTask::on_data_create (PathList& parData) { //fills the empty list with the single record built from m_path and the stat timestamps
+			assert(parData.empty());
+			parData.reserve(1);
+			parData.push_back(mchlib::FileRecordData(
+				std::string(m_path),
+				0,
+				m_stat->st_atime,
+				m_stat->st_mtime,
+				0,
+				false,
+				false
+			));
+		}
+	} //unnamed namespace
+
+	mchlib::TigerHash hash (const std::string& parPath) { //returns the hash of the first record produced by hashing parPath; throws std::runtime_error if stat() fails or parPath is neither a directory nor a regular file
+		using mchlib::FileRecordData;
+		using HashingTaskPtr = std::shared_ptr;
+
+		struct stat path_stat;
+		const int retval = stat(parPath.c_str(), &path_stat);
+		if (retval) {
+			throw std::runtime_error("Can't access file \"" + parPath + "\"");
+		}
+
+		std::shared_ptr>> file_src_task;
+		if (S_ISDIR(path_stat.st_mode)) {
+			file_src_task.reset(new stask::DirTree(parPath));
+		}
+		else {
+			if (not S_ISREG(path_stat.st_mode)) { throw std::runtime_error("Not a regular file or directory: \"" + parPath + "\""); } //runtime check instead of assert(): parPath is caller-supplied and assert() is compiled out under NDEBUG
+			file_src_task.reset(new SingleFileTask(parPath, &path_stat)); //path_stat outlives the task: hashing completes before this function returns
+		}
+
+		auto hashing = HashingTaskPtr(new stask::Hashing(file_src_task, false));
+		return hashing->get_or_create().front().hash;
+	}
+} //namespace din
diff --git a/src/locate/hash.hpp b/src/locate/hash.hpp
new file mode 100644
index 0000000..314006c
--- /dev/null
+++ b/src/locate/hash.hpp
@@ -0,0 +1,28 @@
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer". If not, see .
+ */
+
+#ifndef id3F3E29B28FAA44A190451198CF1FD166
+#define id3F3E29B28FAA44A190451198CF1FD166
+
+#include "dindexer-machinery/tiger.hpp"
+#include
+
+namespace din {
+	mchlib::TigerHash hash ( const std::string& parPath );
+} //namespace din
+
+#endif
diff --git a/src/locate/main.cpp b/src/locate/main.cpp
index a3ecd9b..f548b2d 100644
--- a/src/locate/main.cpp
+++ b/src/locate/main.cpp
@@ -19,6 +19,7 @@
 #include "postgre_locate.hpp"
 #include "dindexer-common/settings.hpp"
 #include "dindexerConfig.h"
+#include "hash.hpp"
 #include
 #include
 #include
@@ -73,7 +74,15 @@ int main (int parArgc, char* parArgv[]) {
 		std::copy(results.begin(), results.end(), std::ostream_iterator(std::cout, "\n"));
 	}
 	else {
-		const auto results = din::locate_in_db(settings.db, vm["substring"].as(), not not vm.count("case-insensitive"));
+		std::vector results;
+
+		if (vm.count("byhash")) { //with --byhash the positional argument is a local path whose content hash is searched for
+			const auto hash = din::hash(vm["substring"].as());
+			results = din::locate_in_db(settings.db, hash);
+		}
+		else {
+			results = din::locate_in_db(settings.db, vm["substring"].as(), not not vm.count("case-insensitive"));
+		}
 		std::copy(results.begin(), results.end(), std::ostream_iterator(std::cout, "\n"));
 	}
 	return 0;
diff --git a/src/locate/postgre_locate.cpp b/src/locate/postgre_locate.cpp
index 3b20003..2df6632 100644
--- a/src/locate/postgre_locate.cpp
+++ b/src/locate/postgre_locate.cpp
@@ -17,6 +17,7 @@
 
 #include "postgre_locate.hpp"
 #include "pq/connection.hpp"
+#include "dindexer-machinery/tiger.hpp"
 #include
 #include
 #include
@@ -53,10 +54,25 @@ namespace din {
 
 			return std::move(retval);
 		}
+
+		std::vector file_result_to_vec (pq::ResultSet&& parResult) { //turns each row's "path", "id" and "group_id" columns into a LocatedItem
+			using boost::lexical_cast;
+
+			std::vector retval;
+			retval.reserve(parResult.size());
+			for (const auto& record : parResult) {
+				retval.push_back(LocatedItem{
+					record["path"],
+					lexical_cast(record["id"]),
+					lexical_cast(record["group_id"])
+				});
+			}
+
+			return retval; //plain return: std::move on a local blocks copy elision and gains nothing
+		}
 	} //unnamed namespace
 
 	std::vector locate_in_db (const dinlib::SettingsDB& parDB, const std::string& parSearch, bool parCaseInsensitive) {
-		using boost::lexical_cast;
 		using boost::string_ref;
 		namespace ba = boost::algorithm;
 
@@ -78,17 +94,15 @@ namespace din {
 		oss << "LIMIT " << g_max_results << ';';
 
 		auto result = conn.query(oss.str());
-		std::vector retval;
-		retval.reserve(result.size());
-		for (const auto& record : result) {
-			retval.push_back(LocatedItem{
-				record["path"],
-				lexical_cast(record["id"]),
-				lexical_cast(record["group_id"])
-			});
-		}
+		return file_result_to_vec(std::move(result));
+	}
 
-		return std::move(retval);
+	std::vector locate_in_db (const dinlib::SettingsDB& parDB, const mchlib::TigerHash& parSearch) { //finds files whose "hash" column equals the string form of parSearch, limited to g_max_results rows
+		const std::string query = std::string("SELECT \"path\",\"id\",\"group_id\" FROM \"files\" WHERE \"hash\"=$1 LIMIT ") + boost::lexical_cast(g_max_results) + ';';
+
+		auto conn = make_pq_conn(parDB);
+		auto result = conn.query(query, mchlib::tiger_to_string(parSearch, true)); //hash is passed as the $1 query parameter, not spliced into the SQL text
+		return file_result_to_vec(std::move(result));
 	}
 
 	std::vector locate_sets_in_db (const dinlib::SettingsDB& parDB, const std::string& parSearch, bool parCaseInsensitive) {
diff --git a/src/locate/postgre_locate.hpp b/src/locate/postgre_locate.hpp
index 3dd01fb..917d687 100644
--- a/src/locate/postgre_locate.hpp
+++ b/src/locate/postgre_locate.hpp
@@ -23,6 +23,10 @@
 #include
 #include
 
+namespace mchlib {
+	struct TigerHash;
+} //namespace mchlib
+
 namespace din {
 	struct LocatedItem {
 		std::string path;
@@ -38,6 +42,7 @@ namespace din {
 	};
 
 	std::vector locate_in_db ( const dinlib::SettingsDB& parDB, const std::string& parSearch, bool parCaseInsensitive );
+	std::vector locate_in_db ( const dinlib::SettingsDB& parDB, const mchlib::TigerHash& parSearch );
 	std::vector locate_sets_in_db ( const dinlib::SettingsDB& parDB, const std::string& parSearch, bool parCaseInsensitive );
 	std::vector locate_sets_in_db ( const dinlib::SettingsDB& parDB, const std::string& parSearch, const std::vector& parSets, bool parCaseInsensitive );
 }
//namespace din
diff --git a/src/machinery/scantask/hashing.cpp b/src/machinery/scantask/hashing.cpp
index 0261777..4e42a75 100644
--- a/src/machinery/scantask/hashing.cpp
+++ b/src/machinery/scantask/hashing.cpp
@@ -145,6 +145,9 @@ namespace mchlib {
 
 		void Hashing::on_data_fill() {
 			std::vector& file_list = m_file_tree_task->get_or_create();
+			if (file_list.empty()) { //nothing to hash; also keeps the front() calls below valid
+				return;
+			}
 
 			ProgressInfo progr_info;
 			progr_info.callback = m_progress_callback;
@@ -153,8 +156,29 @@ namespace mchlib {
 			progr_info.total_bytes_read = 0;
 			progr_info.file_num = 0;
 
-			MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
-			hash_dir(file_list.front(), recordlist, m_ignore_errors, progr_info);
+			if (file_list.front().is_directory) { //first record is a directory: hash the whole listing through hash_dir, as before
+				MutableSetListingView recordlist(file_list.begin(), file_list.end(), 0);
+				hash_dir(file_list.front(), recordlist, m_ignore_errors, progr_info);
+			}
+			else { //first record is not a directory: the list is expected to hold exactly that one file
+				assert(1 == file_list.size());
+				auto& curr_file_rec = file_list.front();
+				TigerHash dummy {}; //passed as tiger_file's third argument; never read afterwards
+
+				try {
+					tiger_file(curr_file_rec.abs_path, curr_file_rec.hash, dummy, curr_file_rec.size);
+					curr_file_rec.hash_valid = true;
+				}
+				catch (const std::ios_base::failure& e) {
+					if (m_ignore_errors) { //record the failure on the entry instead of propagating it
+						curr_file_rec.unreadable = true;
+						curr_file_rec.hash = TigerHash {};
+					}
+					else {
+						throw; //rethrow the in-flight exception; "throw e" would copy it as std::ios_base::failure and slice any derived type
+					}
+				}
+			}
 		}
 
 		void Hashing::set_progress_callback (ProgressCallback parFunc) {