1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2024-11-25 00:53:43 +00:00

Manage read errors.

If instructed to continue on errors, store info correctly in the db.
This commit is contained in:
King_DuckZ 2015-12-10 12:13:16 +00:00
parent ed3dea8f2c
commit a0b87e6a2d
8 changed files with 81 additions and 33 deletions

View file

@ -4,7 +4,7 @@
-- Dumped from database version 9.4.5
-- Dumped by pg_dump version 9.4.5
-- Started on 2015-12-04 12:29:56 GMT
-- Started on 2015-12-10 12:11:34 GMT
SET statement_timeout = 0;
SET lock_timeout = 0;
@ -22,7 +22,7 @@ CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;
--
-- TOC entry 2038 (class 0 OID 0)
-- TOC entry 2039 (class 0 OID 0)
-- Dependencies: 178
-- Name: EXTENSION plpgsql; Type: COMMENT; Schema: -; Owner:
--
@ -70,12 +70,23 @@ CREATE TABLE files (
is_hash_valid boolean DEFAULT true NOT NULL,
access_time timestamp with time zone,
modify_time timestamp with time zone,
CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0)))
unreadable boolean NOT NULL,
CONSTRAINT chk_files_dirsize_zero CHECK (((is_directory = false) OR (size = 0))),
CONSTRAINT chk_hash_0 CHECK ((((NOT unreadable) AND is_hash_valid) OR ((NOT is_hash_valid) AND (hash ~ '^0+$'::text))))
);
ALTER TABLE files OWNER TO @USERNAME@;
--
-- TOC entry 2040 (class 0 OID 0)
-- Dependencies: 175
-- Name: CONSTRAINT chk_hash_0 ON files; Type: COMMENT; Schema: public; Owner: @USERNAME@
--
COMMENT ON CONSTRAINT chk_hash_0 ON files IS 'Make sure hash is 0 if unreadable or not valid are set.';
--
-- TOC entry 174 (class 1259 OID 31279)
-- Name: files_id_seq; Type: SEQUENCE; Schema: public; Owner: @USERNAME@
@ -92,7 +103,7 @@ CREATE SEQUENCE files_id_seq
ALTER TABLE files_id_seq OWNER TO @USERNAME@;
--
-- TOC entry 2039 (class 0 OID 0)
-- TOC entry 2041 (class 0 OID 0)
-- Dependencies: 174
-- Name: files_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
--
@ -118,7 +129,7 @@ CREATE TABLE sets (
ALTER TABLE sets OWNER TO @USERNAME@;
--
-- TOC entry 2040 (class 0 OID 0)
-- TOC entry 2042 (class 0 OID 0)
-- Dependencies: 177
-- Name: COLUMN sets.type; Type: COMMENT; Schema: public; Owner: @USERNAME@
--
@ -148,7 +159,7 @@ CREATE SEQUENCE sets_id_seq
ALTER TABLE sets_id_seq OWNER TO @USERNAME@;
--
-- TOC entry 2041 (class 0 OID 0)
-- TOC entry 2043 (class 0 OID 0)
-- Dependencies: 176
-- Name: sets_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: @USERNAME@
--
@ -165,7 +176,7 @@ ALTER TABLE ONLY files ALTER COLUMN id SET DEFAULT nextval('files_id_seq'::regcl
--
-- TOC entry 1907 (class 2604 OID 31414)
-- TOC entry 1908 (class 2604 OID 31414)
-- Name: id; Type: DEFAULT; Schema: public; Owner: @USERNAME@
--
@ -173,7 +184,7 @@ ALTER TABLE ONLY sets ALTER COLUMN id SET DEFAULT nextval('sets_id_seq'::regclas
--
-- TOC entry 1915 (class 2606 OID 31289)
-- TOC entry 1916 (class 2606 OID 31289)
-- Name: pk_files_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
--
@ -182,7 +193,7 @@ ALTER TABLE ONLY files
--
-- TOC entry 1919 (class 2606 OID 31420)
-- TOC entry 1920 (class 2606 OID 31420)
-- Name: pk_sets_id; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
--
@ -191,7 +202,7 @@ ALTER TABLE ONLY sets
--
-- TOC entry 1917 (class 2606 OID 31294)
-- TOC entry 1918 (class 2606 OID 31294)
-- Name: uniq_item; Type: CONSTRAINT; Schema: public; Owner: @USERNAME@; Tablespace:
--
@ -200,7 +211,7 @@ ALTER TABLE ONLY files
--
-- TOC entry 1912 (class 1259 OID 31426)
-- TOC entry 1913 (class 1259 OID 31426)
-- Name: fki_files_sets; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace:
--
@ -208,7 +219,7 @@ CREATE INDEX fki_files_sets ON files USING btree (group_id);
--
-- TOC entry 1913 (class 1259 OID 31292)
-- TOC entry 1914 (class 1259 OID 31292)
-- Name: idx_paths; Type: INDEX; Schema: public; Owner: @USERNAME@; Tablespace:
--
@ -216,7 +227,7 @@ CREATE INDEX idx_paths ON files USING btree (path);
--
-- TOC entry 1921 (class 2620 OID 31291)
-- TOC entry 1922 (class 2620 OID 31291)
-- Name: triggerupcasehash; Type: TRIGGER; Schema: public; Owner: @USERNAME@
--
@ -224,7 +235,7 @@ CREATE TRIGGER triggerupcasehash BEFORE INSERT OR UPDATE ON files FOR EACH ROW E
--
-- TOC entry 1920 (class 2606 OID 31421)
-- TOC entry 1921 (class 2606 OID 31421)
-- Name: fk_files_sets; Type: FK CONSTRAINT; Schema: public; Owner: @USERNAME@
--
@ -233,7 +244,7 @@ ALTER TABLE ONLY files
--
-- TOC entry 2037 (class 0 OID 0)
-- TOC entry 2038 (class 0 OID 0)
-- Dependencies: 8
-- Name: public; Type: ACL; Schema: -; Owner: postgres
--
@ -244,7 +255,7 @@ GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO PUBLIC;
-- Completed on 2015-12-04 12:29:59 GMT
-- Completed on 2015-12-10 12:11:36 GMT
--
-- PostgreSQL database dump complete

View file

@ -83,6 +83,7 @@ namespace din {
#else
("type,t", po::value<char>()->default_value('V'), type_param_help.c_str())
#endif
("ignore-errors", "Move on even if reading a file fails. Unreadable files are marked as such in the db.")
;
po::options_description positional_options("Positional options");
positional_options.add_options()

View file

@ -94,6 +94,7 @@ namespace din {
}
void write_to_db (const DinDBSettings& parDB, const std::vector<FileRecordData>& parData, const SetRecordData& parSetData) {
auto bool_to_str = [](bool b) { return (b ? "true" : "false"); };
if (parData.empty()) {
return;
}
@ -111,7 +112,7 @@ namespace din {
std::ostringstream query;
query << "INSERT INTO \"files\" " <<
"(path, hash, level, group_id, is_directory, is_symlink, size, " <<
"access_time, modify_time) VALUES "
"access_time, modify_time, is_hash_valid, unreadable) VALUES "
;
const char* comma = "";
@ -121,10 +122,12 @@ namespace din {
query << '(' << conn.escaped_literal(itm.path) << ",'" << itm.hash << "',"
<< itm.level << ','
<< "currval('\"sets_id_seq\"')" << ','
<< (itm.is_directory ? "true" : "false") << ','
<< bool_to_str(itm.is_directory) << ','
<< (itm.is_symlink ? "true" : "false") << ',' << itm.size
<< ',' << '\'' << time_to_str(itm.atime, strtime_buff.get(), strtime_buff_size) << '\''
<< ',' << '\'' << time_to_str(itm.mtime, strtime_buff.get(), strtime_buff_size) << '\''
<< ',' << bool_to_str(itm.hash_valid)
<< ',' << bool_to_str(itm.unreadable)
<< ')';
comma = ",";
}

View file

@ -36,6 +36,8 @@ namespace din {
uint64_t size;
bool is_directory;
bool is_symlink;
bool unreadable;
bool hash_valid;
};
struct SetRecordDataFull {

View file

@ -52,7 +52,8 @@ namespace din {
//file_size(0),
level(static_cast<uint16_t>(parSt.level)),
is_dir(parSt.is_dir),
is_symlink(parSt.is_symlink)
is_symlink(parSt.is_symlink),
unreadable(false)
{
}
@ -71,12 +72,13 @@ namespace din {
uint16_t level;
bool is_dir;
bool is_symlink;
bool unreadable;
};
namespace {
typedef std::vector<FileEntry>::iterator FileEntryIt;
void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback) {
void hash_dir (FileEntryIt parEntry, FileEntryIt parBegin, FileEntryIt parEnd, const PathName& parCurrDir, std::function<void(std::size_t)> parNextItemCallback, bool parIgnoreErrors) {
assert(parEntry != parEnd);
assert(parEntry->is_dir);
FileEntry& curr_entry = *parEntry;
@ -105,7 +107,7 @@ namespace din {
while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
PathName curr_subdir(it_entry->path);
if (it_entry->is_dir) {
hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback);
hash_dir(it_entry, parBegin, parEnd, curr_subdir, parNextItemCallback, parIgnoreErrors);
std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
const auto old_size = dir_blob.size();
@ -148,7 +150,19 @@ namespace din {
std::cout << "Hashing file " << it_entry->path << "...";
#endif
parNextItemCallback(it_entry - parBegin);
tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
//Hash the file; tiger_file() reports read errors by throwing
//std::ios_base::failure. Depending on parIgnoreErrors, either record the
//failure on the entry and carry on, or let the exception propagate.
try {
	tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
}
catch (const std::ios_base::failure&) {
	if (not parIgnoreErrors) {
		//Re-throw the exception currently being handled. A bare "throw;"
		//keeps the original exception object and its dynamic type, whereas
		//"throw e;" would copy-construct a new std::ios_base::failure from
		//the reference and slice anything derived from it.
		throw;
	}
	//Flag the entry so callers can tell it could not be read, and reset
	//its hash to a value-initialized HashType so no partial result from
	//the failed read is kept.
	it_entry->unreadable = true;
	it_entry->hash = HashType {};
}
#if defined(INDEXER_VERBOSE)
std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
#endif
@ -178,6 +192,7 @@ namespace din {
std::condition_variable step_notify;
#endif
std::size_t file_count;
bool ignore_read_errors;
};
bool FileEntry::operator< (const FileEntry& parOther) const {
@ -262,7 +277,8 @@ namespace din {
++m_local_data->done_count;
m_local_data->processing_index = parNext;
m_local_data->step_notify.notify_all();
}
},
m_local_data->ignore_read_errors
);
assert(m_local_data->done_count == m_local_data->file_count);
@ -272,7 +288,8 @@ namespace din {
m_local_data->paths.begin(),
m_local_data->paths.end(),
base_path,
[](std::size_t) {}
[](std::size_t) {},
m_local_data->ignore_read_errors
);
#endif
@ -310,7 +327,9 @@ namespace din {
itm.level,
itm.file_size,
itm.is_dir,
itm.is_symlink
itm.is_symlink,
itm.unreadable,
not itm.unreadable
});
}
@ -382,4 +401,8 @@ namespace din {
std::advance(it, parIndex);
return make_relative_path(PathName(m_local_data->paths.front().path), PathName(it->path)).path();
}
//Store the "ignore read errors" preference in the indexer's local data.
//The stored value is what gets forwarded to hash_dir() as its
//parIgnoreErrors argument: when true, a std::ios_base::failure raised while
//hashing a file marks that entry as unreadable instead of being re-thrown.
//parIgnore: true to keep going after a read failure, false to propagate it.
void Indexer::ignore_read_errors (bool parIgnore) {
m_local_data->ignore_read_errors = parIgnore;
}
} //namespace din

View file

@ -60,6 +60,7 @@ namespace din {
void calculate_hash ( void );
bool add_to_db ( const std::string& parSetName, char parType, bool parForce=false ) const;
bool empty ( void ) const;
//Choose whether hashing should continue when a file cannot be read.
//Pass true to have unreadable files flagged as such rather than aborting
//with an exception; pass false to let the read error propagate.
void ignore_read_errors ( bool parIgnore );
private:
struct LocalData;

View file

@ -97,6 +97,7 @@ int main (int parArgc, char* parArgv[]) {
std::cout << "constructing...\n";
din::Indexer indexer(settings);
indexer.ignore_read_errors(vm.count("ignore-errors") > 0);
fastf::FileSearcher searcher(search_path);
fastf::FileSearcher::ConstCharVecType ext, ignore;
searcher.SetFollowSymlinks(true);

View file

@ -58,6 +58,7 @@ namespace din {
tiger_init_hash(parHashFile);
std::ifstream src(parPath, std::ios::binary);
//NOTE(review): only badbit is in the exception mask. If the file cannot be
//opened at all, the std::ifstream constructor sets failbit, which does not
//throw with this mask - confirm whether open failures (e.g. permission
//denied) are meant to be reported as read errors too.
src.exceptions(src.badbit); //Throw on read error
src.seekg(0, std::ios_base::end);
const auto file_size = src.tellg();
src.seekg(0, std::ios_base::beg);
@ -68,15 +69,19 @@ namespace din {
char* const buff_ptr = reinterpret_cast<char*>((reinterpret_cast<std::intptr_t>(buff.get()) + 63) & (-64));
assert(buff_ptr >= buff.get() and buff_ptr + buffsize <= buff.get() + 63 + buffsize);
//Take a copy of parHashDir and work on it - if hashing fails at some
//point, we need to leave the dir's hash untouched.
auto hash_dir = parHashDir;
//Use the initial value of the dir's hash as if it was part of the data to hash and start
//by processing that value. Hash is reset to the initial value before the call to tiger.
{
std::copy(parHashDir.byte_data, parHashDir.byte_data + sizeof(parHashDir), buff_ptr);
assert(hash_size >= static_cast<FileSizeType>(sizeof(parHashDir)));
std::fill(buff_ptr + sizeof(parHashDir), buff_ptr + hash_size, 0);
std::copy(hash_dir.byte_data, hash_dir.byte_data + sizeof(hash_dir), buff_ptr);
assert(hash_size >= static_cast<FileSizeType>(sizeof(hash_dir)));
std::fill(buff_ptr + sizeof(hash_dir), buff_ptr + hash_size, 0);
TigerHash dummy {};
tiger_init_hash(parHashDir);
tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, parHashDir.data);
tiger_init_hash(hash_dir);
tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, hash_dir.data);
}
auto remaining = file_size;
@ -86,7 +91,7 @@ namespace din {
assert(buffsize % 64 == 0);
remaining -= buffsize;
src.read(buff_ptr, buffsize);
tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, parHashDir.data);
tiger_sse2_chunk(buff_ptr, buff_ptr, buffsize, parHashFile.data, hash_dir.data);
}
{
@ -97,7 +102,7 @@ namespace din {
assert(aligned_size <= buffsize);
const char* read_from_buff = buff_ptr;
if (aligned_size) {
tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, parHashDir.data);
tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, hash_dir.data);
assert((remaining & 63) == remaining - aligned_size);
remaining -= aligned_size;
read_from_buff += aligned_size;
@ -105,10 +110,11 @@ namespace din {
//Remember to pass the augmented data size for the second reallength value: we passed the initial
//dir's hash value (64 bytes) as if they were part of the data.
tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, parHashDir.data, g_tiger_padding);
tiger_sse2_last_chunk(read_from_buff, read_from_buff, remaining, file_size, file_size + hash_size, parHashFile.data, hash_dir.data, g_tiger_padding);
}
parSizeOut = static_cast<uint64_t>(file_size);
parHashDir = hash_dir;
}
std::string tiger_to_string (const TigerHash& parHash, bool parUpcase) {