mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2024-11-25 00:53:43 +00:00
Manage read errors.
If instructed to continue on errors, store info correctly in the db.
This commit is contained in:
parent
ed3dea8f2c
commit
a0b87e6a2d
8 changed files with 81 additions and 33 deletions
43
dindexer.sql
43
dindexer.sql
|
@ -4,7 +4,7 @@
|
|||
|
||||
-- Dumped from database version 9.4.5
|
||||
-- Dumped by pg_dump version 9.4.5
|
||||
-- Started on 2015-12-04 12:29:56 GMT
|
||||
-- Started on 2015-12-10 12:11:34 GMT
|
||||
|
||||
SET statement_timeout = 0;
|
||||
SET lock_timeout = 0;
|
||||
|
@ -22,7 +22,7 @@ CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 2038 (class 0 OID 0)
|
||||
-- TOC entry 2039 (class 0 OID 0)
|
||||
-- Dependencies: 178
|
||||
-- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner:
|
||||
--
|
||||
|
@ -70,12 +70,23 @@ CREATE TABLE files (
|
|||
is_hash_valid boolean DEFAULT true NOT NULL,
|
||||
access_time timestamp with time zone,
|
||||
modify_time timestamp with time zone,
|
||||
CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0)))
|
||||
unreadable boolean NOT NULL,
|
||||
CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0))),
|
||||
CONSTRAINT chk_hash_0 CHECK ((((NOT unreadable) AND is_hash_valid) OR ((NOT is_hash_valid) AND (hash ~ '^0+$'::text))))
|
||||
);
|
||||
|
||||
|
||||
ALTER TABLE files OWNER TO @USERNAME@;
|
||||
|
||||
--
|
||||
-- TOC entry 2040 (class 0 OID 0)
|
||||
-- Dependencies: 175
|
||||
-- Name: CONSTRAINT chk_hash_0 ON files; Type: COMMENT; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
||||
COMMENT ON CONSTRAINT chk_hash_0 ON files IS 'Make sure hash is 0 if unreadable or not valid are set.';
|
||||
|
||||
|
||||
--
|
||||
-- TOC entry 174 (class 1259 OID 31279)
|
||||
-- Name: files_id_seq; Type: SEQUENCE; Schema: public; Owner: @USERNAME@
|
||||
|
@ -92,7 +103,7 @@ CREATE SEQUENCE files_id_seq
|
|||
ALTER TABLE files_id_seq OWNER TO @USERNAME@;
|
||||
|
||||
--
|
||||
-- TOC entry 2039 (class 0 OID 0)
|
||||
-- TOC entry 2041 (class 0 OID 0)
|
||||
-- Dependencies: 174
|
||||
-- Name: files_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
@ -118,7 +129,7 @@ CREATE TABLE sets (
|
|||
ALTER TABLE sets OWNER TO @USERNAME@;
|
||||
|
||||
--
|
||||
-- TOC entry 2040 (class 0 OID 0)
|
||||
-- TOC entry 2042 (class 0 OID 0)
|
||||
-- Dependencies: 177
|
||||
-- Name: COLUMN sets.type; Type: COMMENT; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
@ -148,7 +159,7 @@ CREATE SEQUENCE sets_id_seq
|
|||
ALTER TABLE sets_id_seq OWNER TO @USERNAME@;
|
||||
|
||||
--
|
||||
-- TOC entry 2041 (class 0 OID 0)
|
||||
-- TOC entry 2043 (class 0 OID 0)
|
||||
-- Dependencies: 176
|
||||
-- Name: sets_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
@ -165,7 +176,7 @@ ALTER TABLE ONLY files ALTER COLUMN id SET DEFAULT nextval('files_id_seq'::regcl
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1907 (class 2604 OID 31414)
|
||||
-- TOC entry 1908 (class 2604 OID 31414)
|
||||
-- Name: id; Type: DEFAULT; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
||||
|
@ -173,7 +184,7 @@ ALTER TABLE ONLY sets ALTER COLUMN id SET DEFAULT nextval('sets_id_seq'::regclas
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1915 (class 2606 OID 31289)
|
||||
-- TOC entry 1916 (class 2606 OID 31289)
|
||||
-- Name: pk_files_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
|
||||
--
|
||||
|
||||
|
@ -182,7 +193,7 @@ ALTER TABLE ONLY files
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1919 (class 2606 OID 31420)
|
||||
-- TOC entry 1920 (class 2606 OID 31420)
|
||||
-- Name: pk_sets_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
|
||||
--
|
||||
|
||||
|
@ -191,7 +202,7 @@ ALTER TABLE ONLY sets
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1917 (class 2606 OID 31294)
|
||||
-- TOC entry 1918 (class 2606 OID 31294)
|
||||
-- Name: uniq_item; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
|
||||
--
|
||||
|
||||
|
@ -200,7 +211,7 @@ ALTER TABLE ONLY files
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1912 (class 1259 OID 31426)
|
||||
-- TOC entry 1913 (class 1259 OID 31426)
|
||||
-- Name: fki_files_sets; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace:
|
||||
--
|
||||
|
||||
|
@ -208,7 +219,7 @@ CREATE INDEX fki_files_sets ON files USING btree (group_id);
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1913 (class 1259 OID 31292)
|
||||
-- TOC entry 1914 (class 1259 OID 31292)
|
||||
-- Name: idx_paths; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace:
|
||||
--
|
||||
|
||||
|
@ -216,7 +227,7 @@ CREATE INDEX idx_paths ON files USING btree (path);
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1921 (class 2620 OID 31291)
|
||||
-- TOC entry 1922 (class 2620 OID 31291)
|
||||
-- Name: triggerupcasehash; Type: TRIGGER; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
||||
|
@ -224,7 +235,7 @@ CREATE TRIGGER triggerupcasehash BEFORE INSERT OR UPDATE ON files FOR EACH ROW E
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 1920 (class 2606 OID 31421)
|
||||
-- TOC entry 1921 (class 2606 OID 31421)
|
||||
-- Name: fk_files_sets; Type: FK CONSTRAINT; Schema: public; Owner: @USERNAME@
|
||||
--
|
||||
|
||||
|
@ -233,7 +244,7 @@ ALTER TABLE ONLY files
|
|||
|
||||
|
||||
--
|
||||
-- TOC entry 2037 (class 0 OID 0)
|
||||
-- TOC entry 2038 (class 0 OID 0)
|
||||
-- Dependencies: 8
|
||||
-- Name: public; Type: ACL; Schema: -; Owner: postgres
|
||||
--
|
||||
|
@ -244,7 +255,7 @@ GRANT ALL ON SCHEMA public TO postgres;
|
|||
GRANT ALL ON SCHEMA public TO PUBLIC;
|
||||
|
||||
|
||||
-- Completed on 2015-12-04 12:29:59 GMT
|
||||
-- Completed on 2015-12-10 12:11:36 GMT
|
||||
|
||||
--
|
||||
-- PostgreSQL database dump complete
|
||||
|
|
|
@ -83,6 +83,7 @@ namespace din {
|
|||
#else
|
||||
("type,t", po::value<char>()->default_value('V'), type_param_help.c_str())
|
||||
#endif
|
||||
("ignore-errors", "Move on even if reading a file fails. Unreadable files are marked as such in the db.")
|
||||
;
|
||||
po::options_description positional_options("Positional options");
|
||||
positional_options.add_options()
|
||||
|
|
|
@ -94,6 +94,7 @@ namespace din {
|
|||
}
|
||||
|
||||
void write_to_db (const DinDBSettings& parDB, const std::vector<FileRecordData>& parData, const SetRecordData& parSetData) {
|
||||
auto bool_to_str = [](bool b) { return (b ? "true" : "false"); };
|
||||
if (parData.empty()) {
|
||||
return;
|
||||
}
|
||||
|
@ -111,7 +112,7 @@ namespace din {
|
|||
std::ostringstream query;
|
||||
query << "INSERT INTO \"files\" " <<
|
||||
"(path, hash, level, group_id, is_directory, is_symlink, size, " <<
|
||||
"access_time, modify_time) VALUES "
|
||||
"access_time, modify_time, is_hash_valid, unreadable) VALUES "
|
||||
;
|
||||
|
||||
const char* comma = "";
|
||||
|
@ -121,10 +122,12 @@ namespace din {
|
|||
query << '(' << conn.escaped_literal(itm.path) << ",'" << itm.hash << "',"
|
||||
<< itm.level << ','
|
||||
<< "currval('\"sets_id_seq\"')" << ','
|
||||
<< (itm.is_directory ? "true" : "false") << ','
|
||||
<< bool_to_str(itm.is_directory) << ','
|
||||
<< (itm.is_symlink ? "true" : "false") << ',' << itm.size
|
||||
<< ',' << '\'' << time_to_str(itm.atime, strtime_buff.get(), strtime_buff_size) << '\''
|
||||
<< ',' << '\'' << time_to_str(itm.mtime, strtime_buff.get(), strtime_buff_size) << '\''
|
||||
<< ',' << bool_to_str(itm.hash_valid)
|
||||
<< ',' << bool_to_str(itm.unreadable)
|
||||
<< ')';
|
||||
comma = ",";
|
||||
}
|
||||
|
|
|
@ -36,6 +36,8 @@ namespace din {
|
|||
uint64_t size;
|
||||
bool is_directory;
|
||||
bool is_symlink;
|
||||
bool unreadable;
|
||||
bool hash_valid;
|
||||
};
|
||||
|
||||
struct SetRecordDataFull {
|
||||
|
|
|
@ -52,7 +52,8 @@ namespace din {
|
|||
//file_size(0),
|
||||
level(static_cast<uint16_t>(parSt.level)),
|
||||
is_dir(parSt.is_dir),
|
||||
is_symlink(parSt.is_symlink)
|
||||
is_symlink(parSt.is_symlink),
|
||||
unreadable(false)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -71,12 +72,13 @@ namespace din {
|
|||
uint16_t level;
|
||||
bool is_dir;
|
||||
bool is_symlink;
|
||||
bool unreadable;
|
||||
};
|
||||
|
||||
namespace {
|
||||
typedef std::vector<FileEntry>::iterator FileEntryIt;
|
||||
|
||||
void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback) {
|
||||
void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback, bool parIgnoreErrors) {
|
||||
assert(parEntry != parEnd);
|
||||
assert(parEntry->is_dir);
|
||||
FileEntry& curr_entry = *parEntry;
|
||||
|
@ -105,7 +107,7 @@ namespace din {
|
|||
while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
|
||||
PathName curr_subdir(it_entry->path);
|
||||
if (it_entry->is_dir) {
|
||||
hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback);
|
||||
hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback, parIgnoreErrors);
|
||||
|
||||
std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
|
||||
const auto old_size = dir_blob.size();
|
||||
|
@ -148,7 +150,19 @@ namespace din {
|
|||
std::cout << "Hashing file " << it_entry->path << "...";
|
||||
#endif
|
||||
parNextItemCallback(it_entry - parBegin);
|
||||
try {
|
||||
tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
|
||||
}
|
||||
catch (const std::ios_base::failure& e) {
|
||||
if (parIgnoreErrors) {
|
||||
it_entry->unreadable = true;
|
||||
it_entry->hash = HashType {};
|
||||
}
|
||||
else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(INDEXER_VERBOSE)
|
||||
std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
|
||||
#endif
|
||||
|
@ -178,6 +192,7 @@ namespace din {
|
|||
std::condition_variable step_notify;
|
||||
#endif
|
||||
std::size_t file_count;
|
||||
bool ignore_read_errors;
|
||||
};
|
||||
|
||||
bool FileEntry::operator< (const FileEntry& parOther) const {
|
||||
|
@ -262,7 +277,8 @@ namespace din {
|
|||
++m_local_data->done_count;
|
||||
m_local_data->processing_index = parNext;
|
||||
m_local_data->step_notify.notify_all();
|
||||
}
|
||||
},
|
||||
m_local_data->ignore_read_errors
|
||||
);
|
||||
|
||||
assert(m_local_data->done_count == m_local_data->file_count);
|
||||
|
@ -272,7 +288,8 @@ namespace din {
|
|||
m_local_data->paths.begin(),
|
||||
m_local_data->paths.end(),
|
||||
base_path,
|
||||
[](std::size_t) {}
|
||||
[](std::size_t) {},
|
||||
m_local_data->ignore_read_errors
|
||||
);
|
||||
#endif
|
||||
|
||||
|
@ -310,7 +327,9 @@ namespace din {
|
|||
itm.level,
|
||||
itm.file_size,
|
||||
itm.is_dir,
|
||||
itm.is_symlink
|
||||
itm.is_symlink,
|
||||
itm.unreadable,
|
||||
not itm.unreadable
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -382,4 +401,8 @@ namespace din {
|
|||
std::advance(it, parIndex);
|
||||
return make_relative_path(PathName(m_local_data->paths.front().path), PathName(it->path)).path();
|
||||
}
|
||||
|
||||
void Indexer::ignore_read_errors (bool parIgnore) {
|
||||
m_local_data->ignore_read_errors = parIgnore;
|
||||
}
|
||||
} //namespace din
|
||||
|
|
|
@ -60,6 +60,7 @@ namespace din {
|
|||
void calculate_hash ( void );
|
||||
bool add_to_db ( const std::string& parSetName, char parType, bool parForce=false ) const;
|
||||
bool empty ( void ) const;
|
||||
void ignore_read_errors ( bool parIgnore );
|
||||
|
||||
private:
|
||||
struct LocalData;
|
||||
|
|
|
@ -97,6 +97,7 @@ int main (int parArgc, char* parArgv[]) {
|
|||
std::cout << "constructing...\n";
|
||||
|
||||
din::Indexer indexer(settings);
|
||||
indexer.ignore_read_errors(vm.count("ignore-errors") > 0);
|
||||
fastf::FileSearcher searcher(search_path);
|
||||
fastf::FileSearcher::ConstCharVecType ext, ignore;
|
||||
searcher.SetFollowSymlinks(true);
|
||||
|
|
|
@ -58,6 +58,7 @@ namespace din {
|
|||
tiger_init_hash(parHashFile);
|
||||
|
||||
std::ifstream src(parPath, std::ios::binary);
|
||||
src.exceptions(src.badbit); //Throw on read error
|
||||
src.seekg(0, std::ios_base::end);
|
||||
const auto file_size = src.tellg();
|
||||
src.seekg(0, std::ios_base::beg);
|
||||
|
@ -68,15 +69,19 @@ namespace din {
|
|||
char* const buff_ptr = reinterpret_cast<char*>((reinterpret_cast<std::intptr_t>(buff.get()) + 63) & (-64));
|
||||
assert(buff_ptr >= buff.get() and buff_ptr + buffsize <= buff.get() + 63 + buffsize);
|
||||
|
||||
//Take a copy of parHashDir and work on it - if hashing fails at some
|
||||
//point, we need to leave the dir's hash untouched.
|
||||
auto hash_dir = parHashDir;
|
||||
|
||||
//Use the initial value of the dir's hash as if it was part of the data to hash and start
|
||||
//by processing that value. Hash is reset to the initial value before the call to tiger.
|
||||
{
|
||||
std::copy(parHashDir.byte_data, parHashDir.byte_data + sizeof(parHashDir), buff_ptr);
|
||||
assert(hash_size >= static_cast<FileSizeType>(sizeof(parHashDir)));
|
||||
std::fill(buff_ptr + sizeof(parHashDir), buff_ptr + hash_size, 0);
|
||||
std::copy(hash_dir.byte_data, hash_dir.byte_data + sizeof(hash_dir), buff_ptr);
|
||||
assert(hash_size >= static_cast<FileSizeType>(sizeof(hash_dir)));
|
||||
std::fill(buff_ptr + sizeof(hash_dir), buff_ptr + hash_size, 0);
|
||||
TigerHash dummy {};
|
||||
tiger_init_hash(parHashDir);
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, parHashDir.data);
|
||||
tiger_init_hash(hash_dir);
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, hash_dir.data);
|
||||
}
|
||||
|
||||
auto remaining = file_size;
|
||||
|
@ -86,7 +91,7 @@ namespace din {
|
|||
assert(buffsize % 64 == 0);
|
||||
remaining -= buffsize;
|
||||
src.read(buff_ptr, buffsize);
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, parHashDir.data);
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, hash_dir.data);
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -97,7 +102,7 @@ namespace din {
|
|||
assert(aligned_size <= buffsize);
|
||||
const char* read_from_buff = buff_ptr;
|
||||
if (aligned_size) {
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, parHashDir.data);
|
||||
tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, hash_dir.data);
|
||||
assert((remaining & 63) == remaining - aligned_size);
|
||||
remaining -= aligned_size;
|
||||
read_from_buff += aligned_size;
|
||||
|
@ -105,10 +110,11 @@ namespace din {
|
|||
|
||||
//Remember to pass the augmented data size for the second reallength value: we passed the initial
|
||||
//dir's hash value (64 bytes) as if they were part of the data.
|
||||
tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, parHashDir.data, g_tiger_padding);
|
||||
tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, hash_dir.data, g_tiger_padding);
|
||||
}
|
||||
|
||||
parSizeOut = static_cast<uint64_t>(file_size);
|
||||
parHashDir = hash_dir;
|
||||
}
|
||||
|
||||
std::string tiger_to_string (const TigerHash& parHash, bool parUpcase) {
|
||||
|
|
Loading…
Reference in a new issue