diff --git a/.gitignore b/.gitignore
index 8b3728b..9d8fb01 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 target
 *.zip
 Python-3.10.12
-test_temp_dir*
\ No newline at end of file
+test_temp_dir*
+app.jar
+app.yml
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 59351df..86d4cf7 100755
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -72,3 +72,4 @@ tempfile="3.8"
 # "z": optimize for binary size, but also turn off loop vectorization.
 opt-level = 3 # Use slightly better optimizations.
 overflow-checks = false # Disable integer overflow checks.
+
diff --git a/telego/README b/scripts/telego/README
similarity index 50%
rename from telego/README
rename to scripts/telego/README
index 599a364..28c7d84 100644
--- a/telego/README
+++ b/scripts/telego/README
@@ -1 +1,2 @@
+link `bin_waverless` and `dist_waverless` into the telego project dir
 https://qcnoe3hd7k5c.feishu.cn/wiki/HKyFwat29i8PiEkxhCQcW9NdnTe
\ No newline at end of file
diff --git a/scripts/telego/bin_waverless/.gitignore b/scripts/telego/bin_waverless/.gitignore
new file mode 100644
index 0000000..b512c11
--- /dev/null
+++ b/scripts/telego/bin_waverless/.gitignore
@@ -0,0 +1,2 @@
+prepare_cache
+teledeploy
\ No newline at end of file
diff --git a/scripts/telego/bin_waverless/deployment.yml b/scripts/telego/bin_waverless/deployment.yml
new file mode 100644
index 0000000..530984b
--- /dev/null
+++ b/scripts/telego/bin_waverless/deployment.yml
@@ -0,0 +1,49 @@
+comment: serverless computing platform with fused storage and compute
+
+# embedded install scripts, uploaded to the public teledeploy fileserver
+local_values:
+  pack_py:
+    read_from_file: template/pack.py # must place its output in prepare_cache
+  install_crac.py:
+    read_from_file: template/install_crac.py
+  install_wasmedge.py:
+    read_from_file: template/install_wasmedge.py
+
+prepare:
+  # preprocess with a script: stage the required resources into teledeploy,
+  # then upload them to the public teledeploy fileserver
+  - pyscript: ${pack_py}
+    trans:
+      - copy:
+          - run_node.py: teledeploy/waverless_entry_amd64
+          - wasm_serverless: teledeploy/waverless_amd64
+          - wasm_edge.py: teledeploy/install_wasmedge_inner.py
+          - jdk_crac.tar.gz: teledeploy/jdk_crac.tar.gz
+  # download wasmedge and upload it to the public teledeploy fileserver
+  - url: https://github.com/WasmEdge/WasmEdge/releases/download/0.13.3/WasmEdge-0.13.3-manylinux2014_x86_64.tar.gz
+    trans:
+      - copy:
+          - WasmEdge-0.13.3-manylinux2014_x86_64.tar.gz: teledeploy/WasmEdge-0.13.3-manylinux2014_x86_64.tar.gz
+  # telego's install descriptors are not expressive enough yet, so provide
+  # script-based installation
+  - filemap:
+      content: ${install_crac.py}
+      path: teledeploy/install_crac.py
+      mode: 755
+  # script-based installation
+  - filemap:
+      content: ${install_wasmedge.py}
+      path: teledeploy/install_wasmedge.py
+      mode: 755
+
+bin:
+  # the waverless binary itself
+  waverless:
+  # the waverless entry script
+  waverless_entry:
+  # the wasmedge install script
+  wasmedge:
+    no_default_installer: true
+    py_installer: "install_wasmedge.py ${BIN_PRJ} ${MAIN_NODE_IP}"
+  # the crac install script
+  crac:
+    no_default_installer: true
+    py_installer: "install_crac.py ${BIN_PRJ} ${MAIN_NODE_IP}"
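The py_installer entries above hand telego's ${BIN_PRJ} and ${MAIN_NODE_IP} values straight through as positional arguments, so each installer script below expects exactly two argv entries. A minimal sketch of the substituted invocation, with a made-up project name and IP rather than values taken from this repo:

    import subprocess

    # roughly what `py_installer: "install_crac.py ${BIN_PRJ} ${MAIN_NODE_IP}"`
    # amounts to once telego substitutes its variables (values are hypothetical)
    subprocess.run(
        ["python3", "install_crac.py", "bin_waverless", "10.0.0.1"],
        check=True,
    )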
diff --git a/scripts/telego/bin_waverless/template/install_crac.py b/scripts/telego/bin_waverless/template/install_crac.py
new file mode 100644
index 0000000..c9ed4cb
--- /dev/null
+++ b/scripts/telego/bin_waverless/template/install_crac.py
@@ -0,0 +1,75 @@
+import os,urllib.request,sys
+
+
+install_dir="/teledeploy_secret/bin_crac"
+crac_pack="jdk_crac.tar.gz"
+
+def chdir(dir):
+    print("chdir:",dir)
+    os.chdir(dir)
+
+def os_system(cmd):
+    print("os_system:",cmd)
+    os.system(cmd)
+
+def download(url,file):
+    file=os.path.abspath(file)
+    dir=os.path.dirname(file)
+    os_system(f"mkdir -p {dir}")
+    print(f"downloading {url} to {file}")
+    urllib.request.urlretrieve(url,file)
+
+### utils
+def os_system_sure(command):
+    print(f"running command: {command}")
+    result = os.system(command)
+    if result != 0:
+        print(f"command failed: {command}")
+        exit(1)
+    print(f"command succeeded: {command}")
+
+if len(sys.argv)!=3:
+    print("usage: python3 install_crac.py <BIN_PRJ> <MAIN_NODE_IP>")
+    exit(1)
+BIN_PRJ=sys.argv[1]
+MAIN_NODE_IP=sys.argv[2]
+
+os_system(f"mkdir -p {install_dir}")
+chdir(install_dir)
+
+url=f"http://{MAIN_NODE_IP}:8003/{BIN_PRJ}/{crac_pack}"
+download(url,crac_pack)
+
+# extract jdk_crac.tar.gz into the install dir
+os_system("tar -xvf jdk_crac.tar.gz")
+
+# copy jdk_crac to /usr/jdk_crac
+os_system_sure("rm -rf /usr/jdk_crac && cp -r jdk_crac /usr/jdk_crac")
+
+# switch to jdk crac 17
+def switch_to_jdk_crac():
+    CRAC_INSTALL_DIR = "/usr/jdk_crac"
+    bins=[
+        "java",
+        "javac",
+        "jcmd"
+    ]
+    for bin in bins:
+        os_system_sure(f"update-alternatives --install /usr/bin/{bin} {bin} {CRAC_INSTALL_DIR}/bin/{bin} 100")
+        os_system_sure(f"update-alternatives --set {bin} {CRAC_INSTALL_DIR}/bin/{bin}")
+    # check and update JAVA_HOME in /root/.bashrc; rebuild the line list so an
+    # existing JAVA_HOME line is actually replaced
+    with open("/root/.bashrc", "r") as env_file:
+        lines = env_file.readlines()
+    new_lines = []
+    java_home_set = False
+    for line in lines:
+        if line.startswith("export JAVA_HOME="):
+            line = f"export JAVA_HOME={CRAC_INSTALL_DIR}\n"
+            java_home_set = True
+        new_lines.append(line)
+    if not java_home_set:
+        new_lines.append(f"export JAVA_HOME={CRAC_INSTALL_DIR}\n")
+    print("env lines: ",new_lines)
+
+    with open("/root/.bashrc", "w") as env_file:
+        env_file.writelines(new_lines)
+    print("\nsuccessfully switched to jdk_crac")
+switch_to_jdk_crac()
\ No newline at end of file
diff --git a/scripts/telego/bin_waverless/template/install_wasmedge.py b/scripts/telego/bin_waverless/template/install_wasmedge.py
new file mode 100644
index 0000000..3393847
--- /dev/null
+++ b/scripts/telego/bin_waverless/template/install_wasmedge.py
@@ -0,0 +1,39 @@
+import os,urllib.request,sys
+
+
+install_dir="/teledeploy_secret/bin_wasmedge"
+files=[
+    ["install_wasmedge_inner.py","./"],
+    ["WasmEdge-0.13.3-manylinux2014_x86_64.tar.gz","/tmp/install/"]
+]
+
+def chdir(dir):
+    print("chdir:",dir)
+    os.chdir(dir)
+
+def os_system(cmd):
+    print("os_system:",cmd)
+    os.system(cmd)
+
+def download(url,file):
+    file=os.path.abspath(file)
+    dir=os.path.dirname(file)
+    os_system(f"mkdir -p {dir}")
+    print(f"downloading {url} to {file}")
+    urllib.request.urlretrieve(url,file)
+
+if len(sys.argv)!=3:
+    print("usage: python3 install_wasmedge.py <BIN_PRJ> <MAIN_NODE_IP>")
+    exit(1)
+BIN_PRJ=sys.argv[1]
+MAIN_NODE_IP=sys.argv[2]
+
+os_system(f"mkdir -p {install_dir}")
+chdir(install_dir)
+
+for file in files:
+    url=f"http://{MAIN_NODE_IP}:8003/{BIN_PRJ}/{file[0]}"
+    file=os.path.join(file[1],file[0])
+    download(url,file)
+
+# run the inner installer that was just downloaded
+os_system("python3 install_wasmedge_inner.py")
\ No newline at end of file
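Both installers above fetch every artifact from http://<MAIN_NODE_IP>:8003/<BIN_PRJ>/<file>, so they can be exercised without a real telego main node. A rough local dry-run under stated assumptions (the /tmp/fs layout and loopback IP are illustrative, not part of this repo):

    import subprocess

    # serve the directory tree the installers expect: /tmp/fs/bin_waverless/
    # must hold jdk_crac.tar.gz (and the wasmedge files) before running this
    server = subprocess.Popen(["python3", "-m", "http.server", "8003"], cwd="/tmp/fs")
    try:
        # the installer then resolves http://127.0.0.1:8003/bin_waverless/jdk_crac.tar.gz
        subprocess.run(["python3", "install_crac.py", "bin_waverless", "127.0.0.1"], check=True)
    finally:
        server.terminate()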
diff --git a/scripts/telego/bin_waverless/template/pack.py b/scripts/telego/bin_waverless/template/pack.py
new file mode 100644
index 0000000..3f7eadf
--- /dev/null
+++ b/scripts/telego/bin_waverless/template/pack.py
@@ -0,0 +1,76 @@
+import os
+
+PRJ_DIR=os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+DOWNLOAD_CACHE_DIR=os.path.join(PRJ_DIR,"prepare_cache")
+WAVERLESS_PATH="/root/serverless_benchmark_plus/middlewares/waverless/waverless"
+
+#########
+
+waverless_benchmark_path=os.path.abspath(os.path.join(WAVERLESS_PATH,"../../.."))
+
+# cmd means the command required to prepare the resource
+# rsc means the resource to be prepared
+rscs=[
+    [ # binary
+        {"cmd":"python3 "+os.path.join(WAVERLESS_PATH,"scripts/build/1.1build_core.py")},
+        {"rsc":os.path.join(WAVERLESS_PATH,"scripts/build/pack/waverless_backend/wasm_serverless")},
+    ],
+    [ # entry script
+        {"rsc":os.path.join(WAVERLESS_PATH,"scripts/build/template/run_node.py")},
+    ],
+    [ # wasmedge installer
+        {"rsc":os.path.join(WAVERLESS_PATH,"scripts/install/inner/wasm_edge.py")},
+    ],
+    [ # crac
+        {"cmd":"python3 "+os.path.join(WAVERLESS_PATH,"scripts/install/inner/install_crac.py && "+
+            "mkdir -p /teledeploy_secret/waverless && "
+            "rm -f /teledeploy_secret/waverless/jdk_crac.tar.gz && "+
+            "tar -czvf /teledeploy_secret/waverless/jdk_crac.tar.gz -C /usr jdk_crac")},
+        {"rsc":"/teledeploy_secret/waverless/jdk_crac.tar.gz"}
+    ]
+]
+
+
+def chdir(dir):
+    print("chdir:",dir)
+    os.chdir(dir)
+
+def os_system(cmd):
+    print("os_system:",cmd)
+    os.system(cmd)
+
+def os_system_sure(cmd):
+    print("os_system_sure:",cmd)
+    res=os.system(cmd)
+    if res!=0:
+        raise Exception(f"os_system_sure failed: {cmd}")
+
+for rsc_ in rscs:
+    cmd=""
+    rsc=""
+    copy=""
+
+    for item in rsc_:
+        if "rsc" in item:
+            rsc=item["rsc"]
+
+    for item in rsc_:
+        if rsc=="":
+            # no target resource bound: run the command every time
+            if "cmd" in item:
+                cmd=item["cmd"]
+                os_system_sure(cmd)
+        else:
+            rsc_file=rsc.split("/")[-1]
+            cache_rsc=os.path.join(DOWNLOAD_CACHE_DIR,rsc_file)
+
+            # a target resource is bound: run the command and refresh the
+            # resource only when it is missing (i.e. the cache was deleted)
+            if not os.path.exists(cache_rsc):
+                if "cmd" in item:
+                    cmd=item["cmd"]
+                    os_system_sure(cmd)
+                # copy into the prepare_cache dir
+                os_system_sure(f"mkdir -p {DOWNLOAD_CACHE_DIR}")
+                os_system_sure(f"cp -r {rsc} {DOWNLOAD_CACHE_DIR}")
+
+print("packing waverless related resources done!")
\ No newline at end of file
diff --git a/scripts/telego/dist_waverless/deployment.yml b/scripts/telego/dist_waverless/deployment.yml
new file mode 100644
index 0000000..8c19281
--- /dev/null
+++ b/scripts/telego/dist_waverless/deployment.yml
@@ -0,0 +1,113 @@
+# dist service docs: https://qcnoe3hd7k5c.feishu.cn/wiki/Y9SkwEPmqiTov1knR8KctyJ0nJf
+comment: serverless computing platform with fused storage and compute
+
+local_values: {}
+
+prepare: []
+
+dist:
+  waverless-test:
+    # runs on bare metal
+    type: raw_metal
+    # per-unique-service config plus global config
+    conf:
+      global: {port: 2500}
+      1: {tag: "[meta, master]"}
+      2: {tag: "[meta, worker]"}
+      3: {tag: '[meta, worker]'}
+    # node placement of each unique service
+    distribution:
+      lab1: [1]
+      lab2: [2]
+      lab3: [3]
+    # install script
+    install: |
+      telego install --bin-prj bin_waverless2
+    # stateful-service backup
+    state_backup: |
+      # debug mode: persist nothing
+      rm -rf test_dir/kv_store_engine*
+      rm -rf test_dir/apps
+      rm -rf test_dir/files
+
+      # mkdir -p backup
+      #mv apps backup || true # allow to fail
+      #mv files backup || true # allow to fail
+      #mv kv_store_engine* backup || true # allow to fail
+    # stateful-service restore
+    state_restore: |
+      # mv backup/* . || true # allow to fail
+      ############################################################
+      #
+      # telego project layout
+      #   binary exec dir
+      #   \_ test_dir
+      #      \_ files
+      #      |  \_ node_config.yaml
+      #      \_ apps
+      #      |  |_ app1
+      #      |  |_ app2
+      #      |  \_ ...
+      #      \_ kv_store_engine
+      #
+      ############################################################
+
+      mkdir -p test_dir/files
+      mkdir -p test_dir/apps
+
+      ls test_dir > debug_exec_dir.log
+      ls test_dir/apps > debug_apps_dir.log
+
+      # generate the target config format dynamically from the dist conf
+      cat > gen_nodes_config.py << EOF
+      import os, subprocess
+      # DIST_UNIQUE_ID_LIST is an env var split by ','
+      if 'DIST_UNIQUE_ID_LIST' not in os.environ:
+          print("DIST_UNIQUE_ID_LIST is not set")
+          exit(1)
+      DIST_UNIQUE_ID_LIST = os.environ.get('DIST_UNIQUE_ID_LIST', '').split(',')
+      if DIST_UNIQUE_ID_LIST and DIST_UNIQUE_ID_LIST[0] == '':
+          DIST_UNIQUE_ID_LIST = []
+      with open("test_dir/files/node_config.yaml", "w") as f:
+          f.write("nodes:\n")
+
+          for unique_id in DIST_UNIQUE_ID_LIST:
+              print(f"processing {unique_id}")
+
+              # read the per-node values from environment variables
+              ip = os.getenv(f"DIST_CONF_{unique_id}_NODE_IP")
+              port = os.getenv(f"DIST_CONF_{unique_id}_port")
+              spec = os.getenv(f"DIST_CONF_{unique_id}_tag")
+
+              # write the result into node_config.yaml
+              f.write(f"  {unique_id}:\n")
+              f.write(f"    addr: {ip}:{port}\n")
+              f.write(f"    spec: {spec}\n")
+      def kill_process_by_port(port):
+          try:
+              # get the PID of the process listening on the given port
+              cmd = f"lsof -t -i:{port}"
+              pid = subprocess.check_output(cmd, shell=True).decode().strip()
+
+              # kill that process
+              if pid:
+                  os.kill(int(pid), 9)  # 9 is SIGKILL, i.e. force-terminate
+                  print(f"process {pid} killed")
+              else:
+                  print(f"no process listening on port {port}")
+          except subprocess.CalledProcessError as e:
+              print(f"error: {e}")
+      kill_process_by_port("2500")
+      EOF
+
+    # entry script
+    entrypoint: |
+      echo "start waverless with id $DIST_UNIQUE_ID"
+      # only the host has python3
+      python3 gen_nodes_config.py
+      export RUST_LOG=debug
+      rm -rf ./wasm_serverless
+      ln -s /usr/bin/waverless ./wasm_serverless
+      cp /usr/bin/waverless_entry ./
+
+      ./waverless_entry $DIST_UNIQUE_ID
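The gen_nodes_config.py heredoc above builds node_config.yaml purely from environment variables injected by telego's dist runner: DIST_UNIQUE_ID_LIST plus DIST_CONF_<id>_NODE_IP, DIST_CONF_<id>_port and DIST_CONF_<id>_tag per unique id. A self-contained sketch of that contract with fabricated values, handy for checking the generated format offline (note the script also kills whatever listens on port 2500 as a side effect):

    import os, subprocess

    os.makedirs("test_dir/files", exist_ok=True)  # the script writes into test_dir/files
    env = dict(os.environ)
    env.update({
        # fabricated sample matching the dist conf above
        "DIST_UNIQUE_ID_LIST": "1,2",
        "DIST_CONF_1_NODE_IP": "192.168.0.1", "DIST_CONF_1_port": "2500",
        "DIST_CONF_1_tag": "[meta, master]",
        "DIST_CONF_2_NODE_IP": "192.168.0.2", "DIST_CONF_2_port": "2500",
        "DIST_CONF_2_tag": "[meta, worker]",
    })
    subprocess.run(["python3", "gen_nodes_config.py"], env=env, check=True)
    # expected test_dir/files/node_config.yaml:
    # nodes:
    #   1:
    #     addr: 192.168.0.1:2500
    #     spec: [meta, master]
    #   2:
    #     addr: 192.168.0.2:2500
    #     spec: [meta, worker]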
diff --git a/src/main/build.rs b/src/main/build.rs
index d16dc9e..b7f9e4c 100644
--- a/src/main/build.rs
+++ b/src/main/build.rs
@@ -1,8 +1,8 @@
 use std::io::Result;
 fn main() -> Result<()> {
     let mut config = prost_build::Config::new();
-    config
-        .type_attribute("BatchRequestId", "#[derive(Eq, Hash)]");
+    config.type_attribute("FnTaskId", "#[derive(Eq, Hash)]");
+    config.type_attribute("BatchRequestId", "#[derive(Eq, Hash)]");
     config.compile_protos(
         &[
             "src/general/network/proto_src/kv.proto",
diff --git a/src/main/src/general/app/app_native/app_checkpoint.rs b/src/main/src/general/app/app_native/app_checkpoint.rs
index e69de29..b607964 100644
--- a/src/main/src/general/app/app_native/app_checkpoint.rs
+++ b/src/main/src/general/app/app_native/app_checkpoint.rs
@@ -0,0 +1,180 @@
+// use std::process::Command;
+use std::time::Duration;
+
+use futures::TryFutureExt;
+use tokio::process::Command;
+
+use super::NativeAppFunc;
+use crate::general::app::app_shared::{java, SharedInstance};
+use crate::general::app::{AppType, InstanceManager};
+use crate::general::data::m_data_general::parse_appname_from_data_uid;
+use crate::{
+    general::app::m_executor::{EventCtx, FnExeCtxAsync},
+    result::{WSResult, WsFuncError},
+};
+
+// pub fn function_checkpoint(fn_ctx: &mut FnExeCtxAsync) -> WSResult<Option<String>> {}
+
+pub struct FunctionAppCheckpoint;
+
+impl InstanceManager {
+    pub async fn update_checkpoint(&self, app_name: &str, restart: bool) -> WSResult<()> {
+        async fn debug_port_left() {
+            tracing::debug!("debug port left");
+            // only for test
+
+            let _ = Command::new("lsof")
+                .arg("-i:8080")
+                .spawn()
+                .expect("lsof failed")
+                .wait()
+                .await
+                .unwrap();
+        }
+        let Some(instance) = self.app_instances.get(app_name) else {
+
tracing::warn!("InstanceNotFound when update checkpoint, {}", app_name); + return Err(WsFuncError::InstanceNotFound(app_name.to_owned()).into()); + }; + let Some(SharedInstance(ref proc_ins)) = instance.value().as_shared() else { + tracing::warn!("InstanceTypeNotMatch when update checkpoint, {}", app_name); + return Err(WsFuncError::InstanceTypeNotMatch { + app: app_name.to_owned(), + want: "shared".to_owned(), + } + .into()); + }; + // state 2 connecting, make others wait + { + proc_ins.before_checkpoint(); + tokio::time::sleep(Duration::from_secs(3)).await; + } + // take snap shot + { + tracing::debug!("taking snapshot for app: {}", app_name); + match proc_ins.app_type { + AppType::Jar => java::take_snapshot(app_name, self.view.os()).await, + AppType::Wasm | AppType::Native => { + panic!("wasm/native can't take snapshot") + } + } + } + // recover by criu + // tokio::time::sleep(Duration::from_secs(3)).await; + + tracing::debug!("restart app after snapshot: {}", app_name); + let res = java::JavaColdStart::mksure_checkpoint(self.view.os().app_path(app_name)) + .await + .cold_start(app_name, self.view.os()); + let p = match res { + Err(e) => { + tracing::warn!("cold start failed: {:?}", e); + return Err(e); + } + Ok(ok) => ok, + }; + // just update the process in old instance; because the old is dead; + // let pid = java::wait_for_pid(app_name).await?; + proc_ins.bind_process(p); + let _ = proc_ins.wait_for_verify().await; + tracing::debug!("wait_for_verify done1"); + if !restart { + tracing::debug!("don't restart after checkpoint, kill it"); + + let _ = proc_ins.kill().await; + debug_port_left().await; + // remove instance + let _ = self.app_instances.remove(app_name); + } + + Ok(()) + } + + pub async fn make_checkpoint_for_app(&self, app: &str) -> WSResult<()> { + tracing::debug!("make checkpoint for app: {}", app); + let p = self.get_process_instance(&AppType::Jar, app); + let _ = p.wait_for_verify().await; + tracing::debug!("wait_for_verify done2"); + tokio::time::sleep(Duration::from_secs(3)).await; + + self.update_checkpoint(app, false).await?; + Ok(()) + } +} + +impl NativeAppFunc for FunctionAppCheckpoint { + async fn execute( + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult> { + tracing::debug!("native app FunctionAppCheckpoint"); + // first we get the data unique id, and read it + match fn_ctx.event_ctx_mut() { + EventCtx::KvSet { key, .. 
} => { + tracing::debug!("native app FunctionAppCheckpoint kv set triggered"); + let Some(appname) = parse_appname_from_data_uid(key) else { + return Err(WsFuncError::FuncTriggerAppInvalid { + key: key.to_vec(), + appmeta: None, + context: "native app FunctionAppCheckpoint the trigger app key of app_checkpoint is invalid".to_string(), + } + .into()); + }; + let appmeta_ = instman + .view + .appmeta_manager() + .get_app_meta(&appname) + .await + .map_err(|err| { + tracing::error!("native app FunctionAppCheckpoint get app meta failed to checkpoint err: {:?}", err); + err + })?; + let Some((appmeta, Some(datameta))) = appmeta_ else { + return Err(WsFuncError::FuncTriggerAppInvalid { + key: key.to_vec(), + appmeta: Some((appname.to_string(), appmeta_.clone())), + context: "native app FunctionAppCheckpoint app meta not found".to_string(), + } + .into()); + }; + tracing::debug!( + "native app FunctionAppCheckpoint load appmeta done, load app file start" + ); + instman + .view + .appmeta_manager() + .load_app_file(&appname, datameta) + .await + .map_err(|err| { + tracing::error!("native app FunctionAppCheckpoint load app file failed to checkpoint err: {:?}", err); + err + })?; + tracing::debug!("native app FunctionAppCheckpoint load app file done"); + // start checkpoint + if appmeta.app_type == AppType::Jar { + instman + .make_checkpoint_for_app(&appname) + .await + .map_err(|err| { + tracing::error!("native app FunctionAppCheckpoint make checkpoint for app failed: {:?}", err); + err + })?; + } + } + _ => { + tracing::debug!( + "native app FunctionAppCheckpoint::execute not supported http calling" + ); + return Err(WsFuncError::InvalidTriggerForAppFunction { + app: fn_ctx.app_name().to_string(), + func: fn_ctx.func_name().to_string(), + trigger_type: fn_ctx.event_ctx().clone(), + } + .into()); + } + }; + + // then we start checkpoint + + Ok(None) + } +} diff --git a/src/main/src/general/app/app_native/mod.rs b/src/main/src/general/app/app_native/mod.rs index 5bf2a7e..e1a92b4 100644 --- a/src/main/src/general/app/app_native/mod.rs +++ b/src/main/src/general/app/app_native/mod.rs @@ -2,6 +2,7 @@ pub mod app_checkpoint; use std::collections::HashMap; +use super::instance::m_instance_manager::InstanceManager; use super::{ AffinityPattern, AffinityRule, AppMeta, AppType, DataAccess, DataEventTrigger, FnMeta, KeyPattern, NodeTag, @@ -11,8 +12,16 @@ use crate::general::app::m_executor::{FnExeCtxAsync, FnExeCtxSync}; use crate::general::data::m_data_general::DATA_UID_PREFIX_APP_META; use crate::new_map; use crate::result::{WSResult, WsFuncError}; +use app_checkpoint::FunctionAppCheckpoint; use async_trait::async_trait; +pub trait NativeAppFunc: Sized { + async fn execute( + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult>; +} + pub struct NativeAppInstance { _dummy_private: (), // avoid empty struct } @@ -29,12 +38,43 @@ impl InstanceTrait for NativeAppInstance { fn instance_name(&self) -> String { "native_app_dummy_instance".to_string() } - async fn execute(&self, _fn_ctx: &mut FnExeCtxAsync) -> WSResult> { + async fn execute( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult> { // Native apps don't support async execution - Err(WsFuncError::UnsupportedAppType.into()) + // Err(WsFuncError::UnsupportedAppType.into()) + let res = match fn_ctx.app_name() { + "app_checkpoint" => match fn_ctx.func_name() { + // "checkpointable" => app_checkpoint::checkpointable(fn_ctx), + "checkpoint" => Some(FunctionAppCheckpoint::execute(instman, fn_ctx).await?), 
+ _ => None, + }, + _ => { + return Err(WsFuncError::AppNotFound { + app: fn_ctx.app_name().to_string(), + } + .into()) + } + }; + + let Some(res) = res else { + return Err(WsFuncError::FuncNotFound { + app: fn_ctx.app_name().to_string(), + func: fn_ctx.func_name().to_string(), + } + .into()); + }; + + Ok(res) } - fn execute_sync(&self, _fn_ctx: &mut FnExeCtxSync) -> WSResult> { + fn execute_sync( + &self, + _instman: &InstanceManager, + _fn_ctx: &mut FnExeCtxSync, + ) -> WSResult> { // For now, just return None as native apps don't produce results todo!() // Ok(None) diff --git a/src/main/src/general/app/app_owned/mod.rs b/src/main/src/general/app/app_owned/mod.rs index 615bf55..0cfa113 100644 --- a/src/main/src/general/app/app_owned/mod.rs +++ b/src/main/src/general/app/app_owned/mod.rs @@ -4,9 +4,11 @@ pub mod wasm_host_funcs; use crate::general::app::instance::InstanceTrait; use crate::general::app::instance::OwnedInstance; use crate::general::app::m_executor::{FnExeCtxAsync, FnExeCtxSync}; -use crate::result::{WSResult}; +use crate::result::WSResult; use async_trait::async_trait; +use super::instance::m_instance_manager::InstanceManager; + #[async_trait] impl InstanceTrait for OwnedInstance { fn instance_name(&self) -> String { @@ -14,15 +16,23 @@ impl InstanceTrait for OwnedInstance { OwnedInstance::WasmInstance(v) => v.instance_name(), } } - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> WSResult> { + async fn execute( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult> { match self { - OwnedInstance::WasmInstance(v) => v.execute(fn_ctx).await, + OwnedInstance::WasmInstance(v) => v.execute(instman, fn_ctx).await, } } - fn execute_sync(&self, fn_ctx: &mut FnExeCtxSync) -> WSResult> { + fn execute_sync( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxSync, + ) -> WSResult> { match self { - OwnedInstance::WasmInstance(v) => v.execute_sync(fn_ctx), + OwnedInstance::WasmInstance(v) => v.execute_sync(instman, fn_ctx), } } } diff --git a/src/main/src/general/app/app_owned/wasm.rs b/src/main/src/general/app/app_owned/wasm.rs index 183339f..4771f49 100644 --- a/src/main/src/general/app/app_owned/wasm.rs +++ b/src/main/src/general/app/app_owned/wasm.rs @@ -1,4 +1,5 @@ use crate::general::app::app_owned::wasm_host_funcs; +use crate::general::app::instance::m_instance_manager::InstanceManager; use crate::general::app::instance::InstanceTrait; use crate::general::app::instance::OwnedInstance; use crate::general::app::m_executor::{EventCtx, FnExeCtxAsync, FnExeCtxBase, FnExeCtxSync}; @@ -70,7 +71,11 @@ impl InstanceTrait for WasmInstance { .next() .unwrap() } - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> WSResult> { + async fn execute( + &self, + _instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult> { #[cfg(target_os = "linux")] { let mut final_err = None; @@ -126,7 +131,11 @@ impl InstanceTrait for WasmInstance { /// WASM instances don't support synchronous execution /// See [`FnExeCtxSyncAllowedType`] for supported types (currently only Native) - fn execute_sync(&self, _fn_ctx: &mut FnExeCtxSync) -> WSResult> { + fn execute_sync( + &self, + _instman: &InstanceManager, + _fn_ctx: &mut FnExeCtxSync, + ) -> WSResult> { Err(WsFuncError::UnsupportedAppType.into()) } } diff --git a/src/main/src/general/app/app_shared/java.rs b/src/main/src/general/app/app_shared/java.rs index 432edf5..b173538 100644 --- a/src/main/src/general/app/app_shared/java.rs +++ b/src/main/src/general/app/app_shared/java.rs @@ -9,12 +9,12 @@ 
use crate::{ use super::process::PID; -pub(super) struct JavaColdStart { +pub(crate) struct JavaColdStart { _dummy_private: (), } impl JavaColdStart { - pub(super) async fn mksure_checkpoint(appdir: PathBuf) -> Self { + pub(crate) async fn mksure_checkpoint(appdir: PathBuf) -> Self { let mut i = 0; loop { // if dir not exist, continue @@ -63,7 +63,7 @@ impl JavaColdStart { Self { _dummy_private: () } } - pub(super) fn cold_start(self, app: &str, os: &OperatingSystem) -> WSResult { + pub(crate) fn cold_start(self, app: &str, os: &OperatingSystem) -> WSResult { tracing::debug!("java cold start {}", app); let p = os.start_process(OsProcessType::JavaApp(app.to_owned())); Ok(p) @@ -96,7 +96,7 @@ pub(super) async fn find_pid(app: &str) -> WSResult { Ok(pid) } -pub(super) async fn take_snapshot(app: &str, os: &OperatingSystem) { +pub async fn take_snapshot(app: &str, os: &OperatingSystem) { let res = os .start_process(OsProcessType::JavaCheckpoints(app.to_owned())) .wait() diff --git a/src/main/src/general/app/app_shared/mod.rs b/src/main/src/general/app/app_shared/mod.rs index 4a10431..9716879 100644 --- a/src/main/src/general/app/app_shared/mod.rs +++ b/src/main/src/general/app/app_shared/mod.rs @@ -7,6 +7,8 @@ use crate::general::app::instance::InstanceTrait; use crate::general::app::m_executor::{FnExeCtxAsync, FnExeCtxSync}; use async_trait::async_trait; +use super::instance::m_instance_manager::InstanceManager; + pub struct SharedInstance(pub process::ProcessInstance); impl From for SharedInstance { @@ -20,10 +22,18 @@ impl InstanceTrait for SharedInstance { fn instance_name(&self) -> String { self.0.instance_name() } - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> crate::result::WSResult> { - self.0.execute(fn_ctx).await + async fn execute( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> crate::result::WSResult> { + self.0.execute(instman, fn_ctx).await } - fn execute_sync(&self, fn_ctx: &mut FnExeCtxSync) -> crate::result::WSResult> { - self.0.execute_sync(fn_ctx) + fn execute_sync( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxSync, + ) -> crate::result::WSResult> { + self.0.execute_sync(instman, fn_ctx) } } diff --git a/src/main/src/general/app/app_shared/process.rs b/src/main/src/general/app/app_shared/process.rs index 2f96d6a..9d0859b 100644 --- a/src/main/src/general/app/app_shared/process.rs +++ b/src/main/src/general/app/app_shared/process.rs @@ -2,13 +2,14 @@ use super::process_rpc::{self, proc_proto}; use crate::general::app::app_shared::java; +use crate::general::app::instance::m_instance_manager::InstanceManager; use crate::general::app::instance::InstanceTrait; use crate::general::app::m_executor::{FnExeCtxAsync, FnExeCtxBase, FnExeCtxSync}; use crate::general::{ app::AppType, network::rpc_model::{self, HashValue}, }; -use crate::result::{WsFuncError}; +use crate::result::WsFuncError; use async_trait::async_trait; use enum_as_inner::EnumAsInner; use parking_lot::RwLock; @@ -164,17 +165,19 @@ impl ProcessInstance { } }; tracing::debug!("connecting, wait for verify"); - waiter.await.expect( + let v = waiter.await.expect( "tx lives in ProcessInstanceStateInner::Connecting and destroyed when { notify all the waiters then transfer to Connected } so it's impossible to drop the tx before the rx", - ) + ); + tracing::debug!("verify received"); // let (tx, rx) = oneshot::channel(); // let mut wating_verify = WATING_VERIFY.lock(); // let wating = wating_verify.entry(self.app.clone()).or_insert_with(Vec::new); // wating.push(tx); // 
drop(wating_verify); // let _ = rx.await; + v } pub fn before_checkpoint(&self) { // state to starting @@ -200,7 +203,11 @@ impl InstanceTrait for ProcessInstance { fn instance_name(&self) -> String { self.app.clone() } - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> crate::result::WSResult> { + async fn execute( + &self, + _instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> crate::result::WSResult> { let _ = self.wait_for_verify().await; tracing::debug!( "wait_for_verify done, call app:{}, func:{}", @@ -216,7 +223,11 @@ impl InstanceTrait for ProcessInstance { /// Process instances don't support synchronous execution /// See [`FnExeCtxSyncAllowedType`] for supported types (currently only Native) - fn execute_sync(&self, _fn_ctx: &mut FnExeCtxSync) -> crate::result::WSResult> { + fn execute_sync( + &self, + _instman: &InstanceManager, + _fn_ctx: &mut FnExeCtxSync, + ) -> crate::result::WSResult> { Err(WsFuncError::UnsupportedAppType.into()) } } diff --git a/src/main/src/general/app/app_shared/process_instance_man_related.rs b/src/main/src/general/app/app_shared/process_instance_man_related.rs index ed8e9e7..ddd4818 100644 --- a/src/main/src/general/app/app_shared/process_instance_man_related.rs +++ b/src/main/src/general/app/app_shared/process_instance_man_related.rs @@ -13,88 +13,6 @@ use crate::{ }; impl InstanceManager { - pub async fn update_checkpoint(&self, app_name: &str, restart: bool) -> WSResult<()> { - async fn debug_port_left() { - tracing::debug!("debug port left"); - // only for test - - let _ = Command::new("lsof") - .arg("-i:8080") - .spawn() - .expect("lsof failed") - .wait() - .await - .unwrap(); - } - let Some(instance) = self.app_instances.get(app_name) else { - tracing::warn!("InstanceNotFound when update checkpoint, {}", app_name); - return Err(WsFuncError::InstanceNotFound(app_name.to_owned()).into()); - }; - let Some(SharedInstance(ref proc_ins)) = instance.value().as_shared() else { - tracing::warn!("InstanceTypeNotMatch when update checkpoint, {}", app_name); - return Err(WsFuncError::InstanceTypeNotMatch { - app: app_name.to_owned(), - want: "shared".to_owned(), - } - .into()); - }; - // state 2 connecting, make others wait - { - proc_ins.before_checkpoint(); - tokio::time::sleep(Duration::from_secs(3)).await; - } - // take snap shot - { - tracing::debug!("taking snapshot for app: {}", app_name); - match proc_ins.app_type { - AppType::Jar => java::take_snapshot(app_name, self.view.os()).await, - AppType::Wasm | AppType::Native => { - panic!("wasm/native can't take snapshot") - } - } - } - // recover by criu - // tokio::time::sleep(Duration::from_secs(3)).await; - - tracing::debug!("restart app after snapshot: {}", app_name); - let res = java::JavaColdStart::mksure_checkpoint(self.view.os().app_path(app_name)) - .await - .cold_start(app_name, self.view.os()); - let p = match res { - Err(e) => { - tracing::warn!("cold start failed: {:?}", e); - return Err(e); - } - Ok(ok) => ok, - }; - // just update the process in old instance; because the old is dead; - // let pid = java::wait_for_pid(app_name).await?; - proc_ins.bind_process(p); - let _ = proc_ins.wait_for_verify().await; - tracing::debug!("wait_for_verify done1"); - if !restart { - tracing::debug!("don't restart after checkpoint, kill it"); - - let _ = proc_ins.kill().await; - debug_port_left().await; - // remove instance - let _ = self.app_instances.remove(app_name); - } - - Ok(()) - } - - pub async fn make_checkpoint_for_app(&self, app: &str) -> WSResult<()> { - tracing::debug!("make 
checkpoint for app: {}", app); - let p = self.get_process_instance(&AppType::Jar, app); - let _ = p.wait_for_verify().await; - tracing::debug!("wait_for_verify done2"); - tokio::time::sleep(Duration::from_secs(3)).await; - - self.update_checkpoint(app, false).await?; - Ok(()) - } - /// # Panics /// We call it when we alreay know it's a process /// diff --git a/src/main/src/general/app/app_shared/process_rpc.rs b/src/main/src/general/app/app_shared/process_rpc.rs index e7b115e..9c2dd2a 100644 --- a/src/main/src/general/app/app_shared/process_rpc.rs +++ b/src/main/src/general/app/app_shared/process_rpc.rs @@ -4,6 +4,7 @@ pub mod proc_proto { use self::proc_proto::{FuncCallReq, FuncCallResp}; use super::SharedInstance; +use crate::general::app; use crate::general::app::app_shared::process_rpc::proc_proto::AppStarted; use crate::{ general::network::rpc_model::{self, HashValue, MsgIdBind, ReqMsg, RpcCustom}, @@ -14,6 +15,7 @@ use crate::{ use async_trait::async_trait; use parking_lot::Mutex; use prost::Message; +use std::sync::Arc; use std::{collections::HashMap, path::Path, time::Duration}; use tokio::sync::oneshot; @@ -23,7 +25,16 @@ fn clean_sock_file(path: impl AsRef) { let _ = std::fs::remove_file(path); } -pub struct ProcessRpc; +// pub struct ProcessRpcInner(); + +#[derive(Clone)] +pub struct ProcessRpc(Arc); + +impl ProcessRpc { + pub fn new(app: app::View) -> Self { + ProcessRpc(Arc::new(app)) + } +} lazy_static::lazy_static! { static ref WATING_VERIFY: Mutex>>>=Mutex::new(HashMap::new()); @@ -49,7 +60,7 @@ impl RpcCustom for ProcessRpc { // }; // } - async fn verify(buf: &[u8]) -> Option { + async fn verify(&self, buf: &[u8]) -> Option { let res = proc_proto::AppStarted::decode(buf); let res: proc_proto::AppStarted = match res { Ok(res) => res, @@ -86,11 +97,16 @@ impl RpcCustom for ProcessRpc { // } // update to the instance - let insman = ProcessRpc::global_m_instance_manager(); - let instance = insman.app_instances.get(&res.appid).expect(&format!( - "instance should be inited before get the verify {}", - res.appid - )); + // let insman = ProcessRpc::global_m_instance_manager(); + let instance = self + .0 + .instance_manager() + .app_instances + .get(&res.appid) + .expect(&format!( + "instance should be inited before get the verify {}", + res.appid + )); let Some(s): Option<&SharedInstance> = instance.value().as_shared() else { tracing::warn!("only receive the verify from the instance that is shared"); return None; diff --git a/src/main/src/general/app/http.rs b/src/main/src/general/app/http.rs index aab77e3..4ffb218 100644 --- a/src/main/src/general/app/http.rs +++ b/src/main/src/general/app/http.rs @@ -47,20 +47,6 @@ async fn call_app_fn(Path((app, func)): Path<(String, String)>, body: String) -> .http_handler() .handle_request(&format!("{app}/{func}"), body) .await - } else if !view() - .appmeta_manager() - .app_available(&app) - .await - .map_or_else( - |e| { - tracing::debug!("failed to get app available, e:{:?}", e); - false - }, - |v| v, - ) - { - // # check app valid - StatusCode::BAD_REQUEST.into_response() } else { // # call instance run let req_arrive_time = SystemTime::now() @@ -69,7 +55,7 @@ async fn call_app_fn(Path((app, func)): Path<(String, String)>, body: String) -> .as_millis() as u64; let res = view() .executor() - .handle_http_task(&format!("{app}/{func}"), body) + .handle_http_task(&app, &func, body) // .execute_http_app(FunctionCtxBuilder::new( // app.to_owned(), // self.local_req_id_allocator.alloc(), @@ -114,6 +100,15 @@ async fn upload_app(mut multipart: 
Multipart) -> Response { // let content_type = field.content_type().unwrap().to_string(); let data = field.bytes().await.unwrap(); + #[cfg(test)] + { + *view().appmeta_manager().test_http_app_uploaded.lock() = data.clone(); + } + + // if *view().appmeta_manager().test_http_app_uploaded.lock().unwrap() != data { + // panic!("app_uploaded failed!"); + // } + let name2 = name.clone(); let task = tokio::spawn(async move { view().appmeta_manager().app_uploaded(name2, data).await }); diff --git a/src/main/src/general/app/instance/m_instance_manager.rs b/src/main/src/general/app/instance/m_instance_manager.rs index 1d07c3e..1e8c9cc 100644 --- a/src/main/src/general/app/instance/m_instance_manager.rs +++ b/src/main/src/general/app/instance/m_instance_manager.rs @@ -1,3 +1,4 @@ +use crate::general::app; use crate::general::app::app_native::NativeAppInstance; use crate::general::app::app_owned::wasm; use crate::general::app::app_shared::process_rpc::ProcessRpc; @@ -5,10 +6,17 @@ use crate::general::app::app_shared::SharedInstance; use crate::general::app::instance::Instance; use crate::general::app::m_executor::FnExeCtxAsync; use crate::general::app::m_executor::FnExeCtxSync; +use crate::general::app::AppMetaManager; use crate::general::m_os::OperatingSystem; +use crate::general::network::m_p2p::P2PModule; +use crate::general::network::m_p2p::RPCCaller; +use crate::general::network::m_p2p::RPCHandler; +use crate::general::network::proto; use crate::general::network::rpc_model; +use crate::result::WSResultExt; use crate::result::{WSError, WsFuncError}; use crate::sys::LogicalModulesRef; +use crate::sys::NodeID; use crate::{ general::app::AppType, // worker::host_funcs, result::WSResult, @@ -31,6 +39,7 @@ use std::{ time::Duration, }; use tokio::io::AsyncWriteExt; +use tokio::sync::broadcast; use tokio::sync::Notify; use ws_derive::LogicalModule; @@ -218,6 +227,9 @@ pub struct InstanceManager { logical_module_view_impl!(InstanceManagerView); logical_module_view_impl!(InstanceManagerView, os, OperatingSystem); +logical_module_view_impl!(InstanceManagerView, p2p, P2PModule); +logical_module_view_impl!(InstanceManagerView, appmeta_manager, AppMetaManager); +logical_module_view_impl!(InstanceManagerView, instance_manager, InstanceManager); pub enum UnsafeFunctionCtx { Sync(NonNull), @@ -289,6 +301,7 @@ action: close", // start process rpc Ok(vec![rpc_model::spawn::( + ProcessRpc::new(app::View::new(self.view.copy_module_ref())), self.file_dir .join("agent.sock") .to_str() diff --git a/src/main/src/general/app/instance/mod.rs b/src/main/src/general/app/instance/mod.rs index 4197bce..2f09fea 100644 --- a/src/main/src/general/app/instance/mod.rs +++ b/src/main/src/general/app/instance/mod.rs @@ -8,6 +8,7 @@ use crate::general::app::app_shared::process::ProcessInstance; use crate::result::WSResult; use async_trait::async_trait; use enum_as_inner::EnumAsInner; +use m_instance_manager::InstanceManager; #[derive(EnumAsInner)] pub enum OwnedInstance { @@ -46,19 +47,27 @@ impl InstanceTrait for Instance { Instance::Native(v) => v.instance_name(), } } - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> WSResult> { + async fn execute( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult> { match self { - Instance::Owned(v) => v.execute(fn_ctx).await, - Instance::Shared(v) => v.execute(fn_ctx).await, - Instance::Native(v) => v.execute(fn_ctx).await, + Instance::Owned(v) => v.execute(instman, fn_ctx).await, + Instance::Shared(v) => v.execute(instman, fn_ctx).await, + 
Instance::Native(v) => v.execute(instman, fn_ctx).await, } } - fn execute_sync(&self, fn_ctx: &mut FnExeCtxSync) -> WSResult> { + fn execute_sync( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxSync, + ) -> WSResult> { match self { - Instance::Owned(v) => v.execute_sync(fn_ctx), - Instance::Shared(v) => v.execute_sync(fn_ctx), - Instance::Native(v) => v.execute_sync(fn_ctx), + Instance::Owned(v) => v.execute_sync(instman, fn_ctx), + Instance::Shared(v) => v.execute_sync(instman, fn_ctx), + Instance::Native(v) => v.execute_sync(instman, fn_ctx), } } } @@ -68,6 +77,14 @@ pub enum NewJavaInstanceConfig {} #[async_trait] pub trait InstanceTrait { fn instance_name(&self) -> String; - async fn execute(&self, fn_ctx: &mut FnExeCtxAsync) -> WSResult>; - fn execute_sync(&self, fn_ctx: &mut FnExeCtxSync) -> WSResult>; + async fn execute( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxAsync, + ) -> WSResult>; + fn execute_sync( + &self, + instman: &InstanceManager, + fn_ctx: &mut FnExeCtxSync, + ) -> WSResult>; } diff --git a/src/main/src/general/app/m_executor.rs b/src/main/src/general/app/m_executor.rs index 2c5ddf4..ff8467f 100644 --- a/src/main/src/general/app/m_executor.rs +++ b/src/main/src/general/app/m_executor.rs @@ -3,7 +3,12 @@ use crate::general::app::instance::m_instance_manager::UnsafeFunctionCtx; use crate::general::app::instance::InstanceTrait; use crate::general::app::AppType; use crate::general::app::FnMeta; +use crate::general::network::m_p2p::RPCCaller; +use crate::general::network::m_p2p::TaskId; +use crate::general::network::proto::FnTaskId; use crate::result::WSError; +use crate::result::WSResultExt; +use crate::sys::NodeID; use crate::{ general::{ app::AppMetaManager, @@ -11,8 +16,7 @@ use crate::{ http_handler::ReqId, m_p2p::{P2PModule, RPCHandler, RPCResponsor}, proto::{ - self, - sche::{distribute_task_req, DistributeTaskResp}, + self, {distribute_task_req, DistributeTaskResp}, }, }, }, @@ -22,26 +26,33 @@ use crate::{ util::JoinHandleWrapper, }; use async_trait::async_trait; +use dashmap::DashMap; +use std::time::Duration; use std::{ ptr::NonNull, sync::atomic::{AtomicU32, AtomicUsize}, time::{SystemTime, UNIX_EPOCH}, }; +use tokio::sync::broadcast; use tokio::sync::oneshot; use tokio::task::JoinHandle; #[cfg(target_os = "linux")] use ws_derive::LogicalModule; -pub type SubTaskId = u32; +// pub type SubTaskId = u32; -pub type SubTaskNotifier = oneshot::Sender; +// pub type SubTaskNotifier = oneshot::Sender; -pub type SubTaskWaiter = oneshot::Receiver; +// pub type SubTaskWaiter = oneshot::Receiver; #[derive(Clone, Debug)] pub enum EventCtx { Http(String), - KvSet { key: Vec, opeid: Option }, + KvSet { + key: Vec, + opeid: Option, + src_task_id: proto::FnTaskId, + }, } impl EventCtx { @@ -58,11 +69,12 @@ struct FnExeCtx { pub app_type: AppType, pub func: String, pub _func_meta: FnMeta, - pub _req_id: ReqId, + // pub _req_id: ReqId, + pub task_id: FnTaskId, pub event_ctx: EventCtx, pub res: Option, /// remote scheduling tasks - pub sub_waiters: Vec>, // pub trigger_node: NodeID, + // pub sub_waiters: Vec>, // pub trigger_node: NodeID, _dummy_private: (), } @@ -103,17 +115,17 @@ impl FnExeCtxAsync { app: String, func: String, func_meta: FnMeta, - req_id: ReqId, + task_id: FnTaskId, event_ctx: EventCtx, ) -> Self { Self { inner: FnExeCtx { app, func, - _req_id: req_id, + task_id, event_ctx, res: None, - sub_waiters: vec![], + // sub_waiters: vec![], app_type: apptype.into(), _func_meta: func_meta, _dummy_private: (), @@ -121,6 +133,10 @@ impl 
FnExeCtxAsync { } } + pub fn task_id(&self) -> &FnTaskId { + &self.inner.task_id + } + pub fn event_ctx(&self) -> &EventCtx { &self.inner.event_ctx } @@ -146,6 +162,22 @@ impl FnExeCtxAsync { pub fn take_result(&mut self) -> Option { self.inner.res.take() } + + pub fn app_name(&self) -> &str { + &self.inner.app + } + + pub fn func_name(&self) -> &str { + &self.inner.func + } + + pub fn func_meta(&self) -> &FnMeta { + &self.inner._func_meta + } + + pub fn event_ctx_mut(&mut self) -> &mut EventCtx { + &mut self.inner.event_ctx + } } pub enum FnExeCtxSyncAllowedType { @@ -178,17 +210,18 @@ impl FnExeCtxSync { app: String, func: String, func_meta: FnMeta, - req_id: ReqId, + // req_id: ReqId, + task_id: FnTaskId, event_ctx: EventCtx, ) -> Self { Self { inner: FnExeCtx { app, func, - _req_id: req_id, + task_id, event_ctx, res: None, - sub_waiters: vec![], + // sub_waiters: vec![], app_type: apptype.into(), _func_meta: func_meta, _dummy_private: (), @@ -222,9 +255,19 @@ logical_module_view_impl!(ExecutorView, executor, Executor); #[derive(LogicalModule)] pub struct Executor { sub_task_id: AtomicU32, - rpc_handler_distribute_task: RPCHandler, - next_req_id: AtomicUsize, + // next_req_id: AtomicUsize, view: ExecutorView, + + // src task id -> [(task run node, task id)] + task_subwait_for: DashMap>, + + // this runing task id -> src waiting rpc + task_subwait_by: DashMap>, + + rpc_handler_distribute_task: RPCHandler, + rpc_caller_listen_for_task_done: RPCCaller, + rpc_handler_listen_for_task_done: RPCHandler, + rpc_handler_add_wait_target: RPCHandler, } /// Base trait for function execution contexts @@ -269,6 +312,151 @@ impl FnExeCtxBase for FnExeCtxSync { } } +impl Executor { + pub async fn wait_for_subtasks(&self, thistask: &u32) { + let mut done_tasks = vec![]; + loop { + if !self.task_subwait_for.contains_key(&thistask) { + tracing::debug!( + "src task {} wait_for_subtasks with {:?}", + thistask, + done_tasks + ); + break; + } + let mut wait_tasks = Vec::new(); + while let Some((_thistask, node_tasks)) = self.task_subwait_for.remove(&thistask) { + for (node, task) in node_tasks { + done_tasks.push(task.clone()); + let view = self.view.clone(); + let wait_task = tokio::spawn(async move { + let _ = view + .executor() + .rpc_caller_listen_for_task_done + .call( + view.p2p(), + node, + proto::ListenForTaskDoneReq { + task_id: Some(task), + }, + Some(Duration::from_secs(180)), + ) + .await; + }); + wait_tasks.push(wait_task); + } + } + for wait_task in wait_tasks { + let _ = wait_task.await; + } + } + } + pub fn notify_subwait_done(&self, taskid: &FnTaskId) { + loop { + if let Some((_, sender)) = self.task_subwait_by.remove(&taskid) { + let _ = sender.send(()); + } + return; + } + } + // pub fn take_subwaitings_for_task(&self, taskid: &FnTaskId) -> Option> { + // self.task_subwait_by.remove(&taskid).map(|res| res.1) + // } + async fn start_rpc(&self) -> WSResult<()> { + self.rpc_caller_listen_for_task_done.regist(self.view.p2p()); + { + let view = self.view.clone(); + self.view.executor().rpc_handler_distribute_task.regist( + self.view.p2p(), + move |responser, r| { + // tracing::info!("rpc recv: {:?}", r); + let view = view.clone(); + let _ = tokio::spawn(async move { + view.executor().handle_distribute_task(responser, r).await; + + // if let Err(err) = responser + // .send_resp(proto::sche::DistributeTaskResp {}) + // .await + // { + // tracing::error!("send sche resp failed with err: {}", err); + // } + }); + Ok(()) + }, + ); + } + { + // after some function done, check the waiting list + // 
beingg the listen for task done caller + let view = self.view.clone(); + let _ = + self.rpc_handler_add_wait_target + .regist(self.view.p2p(), move |responsor, req| { + let view = view.clone(); + let _ = tokio::spawn(async move { + view.executor() + .task_subwait_for + .entry(req.src_task_id) + .or_insert_with(|| vec![]) + .push((req.task_run_node, req.sub_task_id.unwrap())); + // view.executor().handle_add_wait_target(responsor,req).await; + let _ = responsor + .send_resp(proto::AddWaitTargetResp { + success: true, + err_msg: "".to_owned(), + }) + .await + .todo_handle("add wait target"); + }); + Ok(()) + }); + } + // { + // self.rpc_caller_add_wait_target.regist(self.view.p2p()); + // } + { + let view = self.view.clone(); + self.rpc_handler_listen_for_task_done + .regist(self.view.p2p(), move |responsor, req| { + let view = view.clone(); + let _ = tokio::spawn(async move { + tracing::debug!("listen for task done: {:?}", req.task_id); + let mut sub = { + view.executor() + .task_subwait_by + .entry(req.task_id.unwrap()) + .or_insert_with(|| broadcast::channel(16).0) + .subscribe() + }; + let res = sub.recv().await; + tracing::debug!("task is done: {:?}", res); + if res.is_ok() { + let _ = responsor + .send_resp(proto::ListenForTaskDoneResp { + success: true, + err_msg: "".to_owned(), + }) + .await + .todo_handle("listen task done"); + } else { + tracing::warn!("listen task done failed: {:?}", res); + let _ = responsor + .send_resp(proto::ListenForTaskDoneResp { + success: false, + err_msg: format!("err:{:?}", res), + }) + .await + .todo_handle("listen task done"); + } + }); + Ok(()) + }); + } + + Ok(()) + } +} + #[async_trait] impl LogicalModule for Executor { fn inner_new(args: LogicalModuleNewArgs) -> Self @@ -279,29 +467,17 @@ impl LogicalModule for Executor { rpc_handler_distribute_task: RPCHandler::default(), view: ExecutorView::new(args.logical_modules_ref.clone()), sub_task_id: AtomicU32::new(0), - next_req_id: AtomicUsize::new(0), + // next_req_id: AtomicUsize::new(0), + rpc_caller_listen_for_task_done: RPCCaller::new(), + rpc_handler_listen_for_task_done: RPCHandler::new(), + rpc_handler_add_wait_target: RPCHandler::new(), + + task_subwait_by: DashMap::new(), + task_subwait_for: DashMap::new(), } } async fn start(&self) -> WSResult> { - let view = self.view.clone(); - self.view.executor().rpc_handler_distribute_task.regist( - self.view.p2p(), - move |responser, r| { - // tracing::info!("rpc recv: {:?}", r); - let view = view.clone(); - let _ = tokio::spawn(async move { - view.executor().handle_distribute_task(responser, r).await; - - // if let Err(err) = responser - // .send_resp(proto::sche::DistributeTaskResp {}) - // .await - // { - // tracing::error!("send sche resp failed with err: {}", err); - // } - }); - Ok(()) - }, - ); + self.start_rpc().await?; // self.view // .p2p() // .regist_rpc::(); @@ -310,11 +486,15 @@ impl LogicalModule for Executor { } impl Executor { - pub fn register_sub_task(&self) -> SubTaskId { + pub fn register_sub_task(&self) -> proto::FnTaskId { let taskid = self .sub_task_id .fetch_add(1, std::sync::atomic::Ordering::Relaxed); - taskid + // (self.view.p2p().nodes_config.this_node(), taskid) + FnTaskId { + call_node_id: self.view.p2p().nodes_config.this_node(), + task_id: taskid, + } } pub async fn local_call_execute_async(&self, ctx: FnExeCtxAsync) -> WSResult> { @@ -327,10 +507,12 @@ impl Executor { pub async fn handle_distribute_task( &self, - resp: RPCResponsor, - req: proto::sche::DistributeTaskReq, + resp: RPCResponsor, + req: 
proto::DistributeTaskReq,
     ) {
         tracing::debug!("receive distribute task: {:?}", req);
+        // alert src to wait for this task
+
         let app = req.app.to_owned();
         let func = req.func.to_owned();
         // todo
@@ -451,7 +633,8 @@ impl Executor {
             // if the function supports sync execution
             // construct sync fn exe ctx
             let ctx = FnExeCtxSync::new(
-                match FnExeCtxAsyncAllowedType::try_from(apptype) { // corrected here to FnExeCtxAsyncAllowedType
+                match FnExeCtxAsyncAllowedType::try_from(apptype) {
+                    // corrected here to FnExeCtxAsyncAllowedType
                     Ok(v) => v,
                     Err(err) => {
                         let warn = format!("app type {:?} not supported, err: {}", apptype, err);
@@ -471,15 +654,17 @@ impl Executor {
                 req.app,
                 req.func,
                 fnmeta.clone(),
-                req.task_id as usize,
+                req.task_id.unwrap(), // as TaskId,
                 match req.trigger.unwrap() {
                     distribute_task_req::Trigger::EventNew(new) => EventCtx::KvSet {
                         key: new.key,
                         opeid: Some(new.opeid),
+                        src_task_id: req.trigger_src_task_id.unwrap(),
                     },
                     distribute_task_req::Trigger::EventWrite(write) => EventCtx::KvSet {
                         key: write.key,
                         opeid: Some(write.opeid),
+                        src_task_id: req.trigger_src_task_id.unwrap(),
                     },
                 },
             );
@@ -493,10 +678,9 @@ impl Executor {
             {
                 tracing::error!("send sche resp for app:{app} fn:{func} failed with err: {err}");
             }
+            let taskid = ctx.inner.task_id.clone();
             let _ = self.execute_sync(ctx);
-
-
-
+            self.notify_subwait_done(&taskid);
         } else {
             // if the function supports async execution
             // construct async fn exe ctx
@@ -521,15 +705,17 @@ impl Executor {
                 req.app,
                 req.func,
                 fnmeta.clone(),
-                req.task_id as usize,
+                req.task_id.unwrap(),
                 match req.trigger.unwrap() {
                     distribute_task_req::Trigger::EventNew(new) => EventCtx::KvSet {
                         key: new.key,
                         opeid: Some(new.opeid),
+                        src_task_id: req.trigger_src_task_id.unwrap(),
                     },
                     distribute_task_req::Trigger::EventWrite(write) => EventCtx::KvSet {
                         key: write.key,
                         opeid: Some(write.opeid),
+                        src_task_id: req.trigger_src_task_id.unwrap(),
                     },
                 },
             );
@@ -543,43 +729,26 @@ impl Executor {
             {
                 tracing::error!("send sche resp for app:{app} fn:{func} failed with err: {err}");
             }
+
+            let taskid = ctx.task_id().clone();
             let _ = self.execute(ctx).await;
+            // notify src task
+            self.notify_subwait_done(&taskid);
+            // self.take_subwaitings_for_task(&ctx.task_id())
         }
-
-
-
-
     }
-    pub async fn handle_http_task(&self, route: &str, text: String) -> WSResult<Option<String>> {
-        let req_id: ReqId = self
-            .next_req_id
-            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-
-        ////////////////////////////////////////////////////
-        // route format ////////////////////////////////////
-        // format route, remove last /
-        let route = if route.ends_with('/') {
-            &route[..route.len() - 1]
-        } else {
-            route
-        };
-        let split = route.split("/").into_iter().collect::<Vec<_>>();
-        // check path ok
-        if split.len() != 2 {
-            tracing::warn!(
-                "route {} not support, only support appname/funcname now",
-                route
-            );
-            return Err(WsFuncError::InvalidHttpUrl(route.to_owned()).into());
-        }
-
-        /////////////////////////////////////////////////
-        // existence ////////////////////////////////////
-        // trigger app
-        let appname = split[0];
-        let funcname = split[1];
+    /// before calling this, verify that the app and func exist
+    pub async fn handle_http_task(
+        &self,
+        appname: &str,
+        funcname: &str,
+        text: String,
+    ) -> WSResult<Option<String>> {
+        // let req_id: ReqId = self
+        //     .next_req_id
+        //     .fetch_add(1, std::sync::atomic::Ordering::SeqCst);

         // check app exist
         tracing::debug!("calling get_app_meta to check app exist, app: {}", appname);
@@ -607,7 +776,11 @@ impl Executor {
         self.view
             .appmeta_manager()
             .load_app_file(appname, datameta)
-            .await?;
+            .await
+            .map_err(|e| {
+                tracing::error!("load app file failed with err:
{}", e); + e + })?; } ///////////////////////////////////////////////// @@ -633,14 +806,15 @@ impl Executor { ///////////////////////////////////////////////// // prepare ctx and run ////////////////////////// + let task_id = self.register_sub_task(); - if func.sync_async.asyncable() { + let res = if func.sync_async.asyncable() { let ctx = FnExeCtxAsync::new( FnExeCtxAsyncAllowedType::try_from(appmeta.app_type.clone()).unwrap(), appname.to_owned(), funcname.to_owned(), func.clone(), - req_id, + task_id.clone(), EventCtx::Http(text), ); self.execute(ctx).await @@ -650,12 +824,17 @@ impl Executor { appname.to_owned(), funcname.to_owned(), func.clone(), - req_id, + task_id.clone(), EventCtx::Http(text), ); self.execute_sync(ctx) - } + }; + + // wait for sub tasks done + self.wait_for_subtasks(&task_id.task_id).await; + + res } // pub async fn execute_http_app(&self, fn_ctx_builder: FunctionCtxBuilder) { // let app_meta_man = self.view.instance_manager().app_meta_manager.read().await; @@ -718,7 +897,7 @@ impl Executor { .as_millis() as u64; tracing::debug!("start execute sync"); - let res = instance.execute_sync(&mut ctx)?; + let res = instance.execute_sync(self.view.instance_manager(), &mut ctx)?; let res = res.map(|v| { let mut res: serde_json::Value = serde_json::from_str(&*v).unwrap(); @@ -781,7 +960,9 @@ impl Executor { .as_millis() as u64; tracing::debug!("start execute"); - let res = instance.execute(&mut fn_ctx).await; + let res = instance + .execute(self.view.instance_manager(), &mut fn_ctx) + .await; let res = res.map(|v| { v.map(|v| { @@ -807,9 +988,9 @@ impl Executor { res ); - while let Some(t) = fn_ctx.inner.sub_waiters.pop() { - let _ = t.await.unwrap(); - } + // while let Some(t) = fn_ctx.inner.sub_waiters.pop() { + // let _ = t.await.unwrap(); + // } self.view .instance_manager() diff --git a/src/main/src/general/app/mod.rs b/src/main/src/general/app/mod.rs index be46a7e..5787969 100644 --- a/src/main/src/general/app/mod.rs +++ b/src/main/src/general/app/mod.rs @@ -6,7 +6,6 @@ pub mod instance; pub mod m_executor; pub mod v_os; -use std::path::PathBuf; use super::data::m_data_general::{DataSetMetaV2, GetOrDelDataArg, GetOrDelDataArgType}; use super::m_os::APPS_REL_DIR; use crate::general::app::app_native::native_apps; @@ -44,7 +43,10 @@ use async_trait::async_trait; use axum::body::Bytes; use enum_as_inner::EnumAsInner; use m_executor::FnExeCtxSyncAllowedType; +use parking_lot::Mutex; use serde::{de::Error, Deserialize, Deserializer, Serialize}; +use std::path::PathBuf; +use std::time::Duration; use std::{ borrow::Borrow, collections::{BTreeMap, HashMap}, @@ -297,7 +299,7 @@ pub enum AppType { Native, } -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Debug)] pub struct AppMeta { pub app_type: AppType, pub fns: HashMap, @@ -631,6 +633,8 @@ pub struct AppMetaManager { view: View, pub native_apps: HashMap, // app_meta_list_lock: Mutex<()>, + #[cfg(test)] + pub test_http_app_uploaded: Mutex, } #[async_trait] @@ -656,7 +660,8 @@ impl LogicalModule for AppMetaManager { view, fs_layer, native_apps: native_apps(), - // app_meta_list_lock: Mutex::new(()), + #[cfg(test)] + test_http_app_uploaded: Mutex::new(Bytes::new()), // app_meta_list_lock: Mutex::new(()), } } async fn init(&self) -> WSResult<()> { @@ -692,9 +697,9 @@ impl AppMetas { // } // } // pub async fn set_tmp_appmeta(&self, ) - fn get_tmp_app_meta(&self, app: &str) -> Option { - self.tmp_app_metas.get(app).cloned() - } + // fn get_tmp_app_meta(&self, app: &str) -> Option { + // 
self.tmp_app_metas.get(app).cloned() + // } pub fn get_pattern_triggers( &self, @@ -776,67 +781,67 @@ impl AppMetaManager { // TODO: Implement app loading logic Ok(()) } - async fn construct_tmp_app(&self, tmpapp: &str) -> WSResult { - // 1.meta - // let appdir = self.fs_layer.concat_app_dir(app); - let appmeta = self.fs_layer.read_app_meta(tmpapp).await?; - - // TODO: 2.check project dir - // 3. if java, take snapshot - if let AppType::Jar = appmeta.app_type { - let _ = self - .meta - .write() - .await - .tmp_app_metas - .insert(tmpapp.to_owned(), appmeta.clone()); - tracing::debug!("record app meta to make checkpoint {}", tmpapp); - self.view - .instance_manager() - .make_checkpoint_for_app(tmpapp) - .await?; - self.view - .instance_manager() - .drap_app_instances(tmpapp) - .await; - // remove app_meta - tracing::debug!("checkpoint made, remove app meta {}", tmpapp); - let _ = self - .meta - .write() - .await - .tmp_app_metas - .remove(tmpapp) - .unwrap_or_else(|| { - panic!("remove app meta failed, app: {}", tmpapp); - }); - } + // async fn construct_tmp_app(&self, tmpapp: &str) -> WSResult { + // // 1.meta + // // let appdir = self.fs_layer.concat_app_dir(app); + // let appmeta = self.fs_layer.read_app_meta(tmpapp).await?; + + // // TODO: 2.check project dir + // // 3. if java, take snapshot + // if let AppType::Jar = appmeta.app_type { + // let _ = self + // .meta + // .write() + // .await + // .tmp_app_metas + // .insert(tmpapp.to_owned(), appmeta.clone()); + // tracing::debug!("record app meta to make checkpoint {}", tmpapp); + // self.view + // .instance_manager() + // .make_checkpoint_for_app(tmpapp) + // .await?; + // self.view + // .instance_manager() + // .drap_app_instances(tmpapp) + // .await; + // // remove app_meta + // tracing::debug!("checkpoint made, remove app meta {}", tmpapp); + // let _ = self + // .meta + // .write() + // .await + // .tmp_app_metas + // .remove(tmpapp) + // .unwrap_or_else(|| { + // panic!("remove app meta failed, app: {}", tmpapp); + // }); + // } - Ok(appmeta) - } - pub async fn app_available(&self, app: &str) -> WSResult { - match self - .view - .data_general() - .get_or_del_datameta_from_master( - format!("{}{}", DATA_UID_PREFIX_APP_META, app).as_bytes(), - false, - ) - .await - { - Err(err) => match err { - WSError::WsDataError(WsDataError::DataSetNotFound { uniqueid }) => { - tracing::debug!( - "app meta not found, app: {}", - std::str::from_utf8(&*uniqueid).unwrap() - ); - Ok(false) - } - _ => Err(err), - }, - Ok(_) => Ok(true), - } - } + // Ok(appmeta) + // } + // pub async fn app_available(&self, app: &str) -> WSResult { + // match self + // .view + // .data_general() + // .get_or_del_datameta_from_master( + // format!("{}{}", DATA_UID_PREFIX_APP_META, app).as_bytes(), + // false, + // ) + // .await + // { + // Err(err) => match err { + // WSError::WsDataError(WsDataError::DataSetNotFound { uniqueid }) => { + // tracing::debug!( + // "app meta not found, app: {}", + // std::str::from_utf8(&*uniqueid).unwrap() + // ); + // Ok(false) + // } + // _ => Err(err), + // }, + // Ok(_) => Ok(true), + // } + // } /// get app by idx 1 pub async fn load_app_file(&self, app: &str, datameta: DataSetMetaV2) -> WSResult<()> { @@ -845,25 +850,56 @@ impl AppMetaManager { app, datameta ); - let mut data = match self - .view - .data_general() - .get_or_del_data(GetOrDelDataArg { - meta: Some(datameta), - unique_id: format!("{}{}", DATA_UID_PREFIX_APP_META, app).into(), - ty: GetOrDelDataArgType::PartialOne { idx: 1 }, - }) - .await - { - Err(err) => { - 
tracing::warn!("get app file failed, err: {:?}", err); - return Err(err); + + // 简易轮询实现,确保应用被完整上传到系统;后续增加数据ready等待能力 + let mut data: Option> = None; + for i in 0..10 { + match self + .view + .data_general() + .get_or_del_datas(GetOrDelDataArg { + meta: Some(datameta.clone()), + unique_id: format!("{}{}", DATA_UID_PREFIX_APP_META, app).into(), + ty: GetOrDelDataArgType::PartialOne { idx: 1 }, + }) + .await + { + Err(err) => { + tracing::warn!("get app file failed, err: {:?}", err); + // return Err(err); + } + Ok((_datameta, data_items)) => { + // data + if data_items.len() == 1 { + data = Some(data_items); + break; + } + tracing::warn!( + "get app file failed, data item not complete, count: {}", + data_items.len() + ); + } + }; + if i == 4 { + tracing::warn!( + "get app file failed, stop retry", + // data_items.len() + ); + } else { + tracing::warn!("get app file failed, will retry for the {} time", i); } - Ok((_datameta, data)) => data, + tokio::time::sleep(Duration::from_secs(5)).await; + } + let Some(mut data) = data else { + return Err(WsFuncError::AppPackLoadFailed { + app: app.to_owned(), + err: None, + context: "app file not found".to_owned(), + } + .into()); }; - let proto::DataItem { - data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(appfiledata)), + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(_)), } = data.remove(&1).unwrap() else { return Err(WsFuncError::InvalidAppMetaDataItem { @@ -872,28 +908,45 @@ impl AppMetaManager { .into()); }; - // extract app file - let zipfilepath = appfiledata.file_name_opt; + // check app dir exists and app.yml exists let appdir = self.fs_layer.concat_app_dir(app); - let res = tokio::task::spawn_blocking(move || { - // remove old app dir - if appdir.exists() { - fs::remove_dir_all(&appdir).unwrap(); + if !appdir.exists() { + tracing::warn!("app dir not exists, app: {}", app); + return Err(WsFuncError::AppPackLoadFailed { + app: app.to_owned(), + err: None, + context: "app dir not exists after get app data".to_owned(), } - // open zip file - let zipfile = std::fs::File::open(zipfilepath)?; - zip_extract::extract(zipfile, &appdir, false) - }) - .await - .unwrap(); - - if let Err(err) = res { - tracing::warn!("extract app file failed, err: {:?}", err); - return Err(WsFuncError::AppPackFailedZip(err).into()); + .into()); } + // extract app file + // let zipfilepath = self.view.os().file_path.join(appfiledata.file_name_opt); + // let appdir = self.fs_layer.concat_app_dir(app); + // let res = tokio::task::spawn_blocking(move || { + // // remove old app dir + // if appdir.exists() { + // fs::remove_dir_all(&appdir).unwrap(); + // } + // // open zip file + // let zipfile = std::fs::File::open(zipfilepath)?; + // zip_extract::extract(zipfile, &appdir, false) + // }) + // .await + // .unwrap(); + + // if let Err(err) = res { + // tracing::warn!("extract app file failed, err: {:?}", err); + // return Err(WsFuncError::AppPackFailedZip(err).into()); + // } + Ok(()) } + + // fn get_native_app_meta(&self, app: &str) -> WSResult> { + // Ok() + // } + /// get app meta by idx 0 /// None DataSetMetaV2 means temp app prepared /// Some DataSetMetaV2 means app from inner storage @@ -901,15 +954,18 @@ impl AppMetaManager { &self, app: &str, ) -> WSResult)>> { - if let Some(res) = self.meta.read().await.get_tmp_app_meta(app) { - return Ok(Some((res, None))); + if let Some(nativeapp) = self.native_apps.get(app).cloned() { + return Ok(Some((nativeapp, None))); } + // if let Some(res) = self.meta.read().await.get_tmp_app_meta(app) { + // 
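// A minimal sketch of the bounded polling loop in load_app_file above: probe
// until the data is complete, sleeping a fixed interval between attempts.
// `poll_until` is an illustrative name; the PR inlines this logic.
use std::{future::Future, time::Duration};

async fn poll_until<T, F, Fut>(mut probe: F, attempts: usize, interval: Duration) -> Option<T>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Option<T>>,
{
    for i in 0..attempts {
        if let Some(v) = probe().await {
            return Some(v);
        }
        if i + 1 < attempts {
            tokio::time::sleep(interval).await;
        }
    }
    None // caller maps this to an AppPackLoadFailed-style error
}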
return Ok(Some((res, None))); + // } // self.app_metas.get(app) tracing::debug!("calling get_or_del_data to get app meta, app: {}", app); let datameta = view() .data_general() - .get_or_del_data(GetOrDelDataArg { + .get_or_del_datas(GetOrDelDataArg { meta: None, unique_id: format!("{}{}", DATA_UID_PREFIX_APP_META, app).into(), ty: GetOrDelDataArgType::PartialOne { idx: 0 }, @@ -1002,8 +1058,9 @@ impl AppMetaManager { } }; - // 3. check meta - let res = self.construct_tmp_app(&tmpapp).await; + ///// + // check meta by tmp app dir + let res = self.fs_layer.read_app_meta(&tmpapp).await; //self.construct_tmp_app(&tmpapp).await; let appmeta = match res { Err(e) => { let _ = fs::remove_dir_all(&tmpappdir); @@ -1013,27 +1070,19 @@ impl AppMetaManager { Ok(appmeta) => appmeta, }; - // remove temp dir - // let _ = fs::remove_dir_all(&tmpappdir).map_err(|e| WSError::from(WsIoErr::Io(e)))?; - + ///// // mv temp app to formal app dir let rel_app_dir = format!("{}/{}", APPS_REL_DIR, appname); - // 修改前: let formal_app_dir = self.view.os().file_path.join(rel_app_dir); rel_app_dir是字符串类型发生所有权转移,然而在下方还被使用了,选择修改为clone 曾俊 let formal_app_dir = self.view.os().file_path.join(rel_app_dir.clone()); - //let _ = fs::rename(&tmpappdir, &formal_app_dir).map_err(|e| WSError::from(WsDataError::FileOpenErr { path: (), err: () })); - //虞光勇修改:因为在调用 fs::rename 并处理其结果时遇到了类型不匹配的问题。具体来说, - // 在构造WsDataError::FileOpenErr 时,path 字段的类型不匹配:期望的是 PathBuf 类型,但实际传入的是 ()(即单元类型)。 - //修改: - // let result = fs::rename(&tmpappdir, &formal_app_dir).map_err(|e| { - // 这里result变量下方没有再使用 加了一个标志 曾俊 - let _result = fs::rename(&tmpappdir, &formal_app_dir).map_err(|e| { + let _ = fs::rename(&tmpappdir, &formal_app_dir).map_err(|e| { WSError::from(WsDataError::FileOpenErr { path: PathBuf::from(formal_app_dir.clone()), err: e, }) - }); + })?; - // 3. broadcast meta and appfile + ///// + // write data to whole system let write_data_id = format!("{}{}", DATA_UID_PREFIX_APP_META, appname); let write_datas = vec![ DataItemArgWrapper::from_bytes(bincode::serialize(&appmeta).unwrap()), @@ -1045,15 +1094,16 @@ impl AppMetaManager { //这里的 from_file 方法返回一个 Result, // 但你直接将其赋值给一个期望 DataItemArgWrapper 类型的变量或参数,导致类型不匹配。使用 ? 
操作符 //DataItemArgWrapper::from_file(rel_app_dir.into())?, - DataItemArgWrapper::from_file(rel_app_dir.into())?, + DataItemArgWrapper::from_file(self.view.copy_module_ref(), rel_app_dir.into())?, ]; tracing::debug!( "app data size: {:?}", write_datas .iter() - // 修改前:.map(|v| v.to_string()) 去掉了这一行,为结构体派生了debug特征 曾俊 + // 修改前:.map(|v| v.to_string()) 去掉了这一行,为结构体派生了debug特征 曾俊 .collect::>() ); + let task = self.view.executor().register_sub_task(); self.view .data_general() .write_data( @@ -1063,10 +1113,13 @@ impl AppMetaManager { self.view.p2p().nodes_config.this_node(), proto::DataOpeType::Write, OpeRole::UploadApp(DataOpeRoleUploadApp {}), + task.clone(), )), ) .await?; - tracing::debug!("app uploaded"); + // wait for sub task done(checkpoint) + self.view.executor().wait_for_subtasks(&task.task_id).await; + tracing::debug!("app uploaded, wait for sub task done"); Ok(()) } @@ -1082,38 +1135,38 @@ impl AppMetaManager { // .todo_handle("This part of the code needs to be implemented."); //修改后代码:对set函数的返回类型进行处理 曾俊 - match self.view - .kv_store_engine() - .set( - KeyTypeServiceList, - &serde_json::to_string(&list).unwrap().into(), - false, - ) { + match self.view.kv_store_engine().set( + KeyTypeServiceList, + &serde_json::to_string(&list).unwrap().into(), + false, + ) { Ok((version, _)) => { - tracing::debug!("App meta list updated successfully, version: {}, list: {:?}", version, list); - }, + tracing::debug!( + "App meta list updated successfully, version: {}, list: {:?}", + version, + list + ); + } Err(e) => { tracing::error!("Failed to set app meta list: {:?}", e); } + } } -} - -pub fn get_app_meta_list(&self) -> Vec { - let res = self - .view - .kv_store_engine() - .get(&KeyTypeServiceList, false, KvAdditionalConf {}) - .map(|(_version, list)| list) - .unwrap_or_else(|| { - return vec![]; - }); - serde_json::from_slice(&res).unwrap_or_else(|e| { - tracing::warn!("parse app meta list failed, err: {:?}", e); - vec![] - }) -} - + pub fn get_app_meta_list(&self) -> Vec { + let res = self + .view + .kv_store_engine() + .get(&KeyTypeServiceList, false, KvAdditionalConf {}) + .map(|(_version, list)| list) + .unwrap_or_else(|| { + return vec![]; + }); + serde_json::from_slice(&res).unwrap_or_else(|e| { + tracing::warn!("parse app meta list failed, err: {:?}", e); + vec![] + }) + } // pub fn get_app_meta_basicinfo_list(&self) -> Vec { // let apps = self.get_app_meta_list(); diff --git a/src/main/src/general/data/m_data_general/batch.rs b/src/main/src/general/data/m_data_general/batch.rs index 4600187..844612b 100644 --- a/src/main/src/general/data/m_data_general/batch.rs +++ b/src/main/src/general/data/m_data_general/batch.rs @@ -25,18 +25,21 @@ /// /// For detailed implementation of the regular data interface, see the data.rs module. 
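// A minimal sketch of the lenient list parsing in get_app_meta_list above: a
// JSON-encoded Vec<String> read from the KV store degrades to an empty list
// on a missing or corrupt value instead of failing. serde_json and tracing
// are assumed as dependencies, as in the surrounding code.
fn parse_app_list(raw: Option<Vec<u8>>) -> Vec<String> {
    let Some(bytes) = raw else { return vec![] };
    serde_json::from_slice(&bytes).unwrap_or_else(|e| {
        tracing::warn!("parse app meta list failed, err: {:?}", e);
        vec![]
    })
}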
use super::*; +use crate::general::data::m_data_general::batch_handler::DEFAULT_BLOCK_SIZE; +use crate::general::data::m_data_general::dataitem::DataItemSource; use crate::general::network::proto; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; -use tokio::sync::Semaphore; use std::sync::Arc; use std::time::Duration; -use crate::general::data::m_data_general::dataitem::DataItemSource; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; +use tokio::sync::Semaphore; impl proto::DataItem { pub fn size(&self) -> usize { match &self.data_item_dispatch { Some(proto::data_item::DataItemDispatch::RawBytes(bytes)) => bytes.len(), - Some(proto::data_item::DataItemDispatch::File(file_data)) => file_data.file_content.len(), + Some(proto::data_item::DataItemDispatch::File(file_data)) => { + file_data.file_content.len() + } None => 0, } } @@ -61,9 +64,7 @@ impl DataGeneral { ) -> WSResult<()> { let total_size = match data.as_ref() { DataItemSource::Memory { data } => data.len(), - DataItemSource::File { path } => { - tokio::fs::metadata(path).await?.len() as usize - } + DataItemSource::File { path } => tokio::fs::metadata(path).await?.len() as usize, }; let total_blocks = (total_size + DEFAULT_BLOCK_SIZE - 1) / DEFAULT_BLOCK_SIZE; let semaphore = Arc::new(Semaphore::new(32)); @@ -114,7 +115,8 @@ impl DataGeneral { let view = view.clone(); let handle = tokio::spawn(async move { let _permit = permit; // 持有permit直到任务完成 - let resp = view.data_general() + let resp = view + .data_general() .rpc_call_batch_data .call( view.p2p(), @@ -123,7 +125,7 @@ impl DataGeneral { Some(Duration::from_secs(30)), ) .await?; - + if !resp.success { return Err(WsDataError::BatchTransferError { request_id: proto::BatchRequestId { @@ -131,7 +133,8 @@ impl DataGeneral { sequence: block_idx as u64, // 修复:使用 u64 }, msg: resp.error_message, - }.into()); + } + .into()); } Ok(()) }); diff --git a/src/main/src/general/data/m_data_general/batch_handler.rs b/src/main/src/general/data/m_data_general/batch_handler.rs index c5420ce..30e3aea 100644 --- a/src/main/src/general/data/m_data_general/batch_handler.rs +++ b/src/main/src/general/data/m_data_general/batch_handler.rs @@ -1,19 +1,141 @@ -use crate::general::network::{ - proto::BatchDataRequest, - proto::BatchDataResponse, - m_p2p::RPCResponsor, +use super::{DataGeneral, DataSetMetaV2}; +use crate::{ + general::{ + data::m_data_general::dataitem::WriteSplitDataTaskGroup, + network::{ + m_p2p::RPCResponsor, + proto::{self, BatchDataRequest, BatchDataResponse, DataItem}, + proto_ext::{DataItemExt, ProtoExtDataItem}, + }, + }, + result::{WSError, WSResult, WSResultExt, WsDataError}, + sys::NodeID, }; -use std::sync::Arc; -use tokio::sync::Mutex; +use async_trait::async_trait; +use std::{ + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, + time::Duration, +}; +use tokio::sync::{futures::Notified, oneshot, Mutex, Notify, RwLock}; use tracing; +/// 默认数据块大小 (4MB) +pub const DEFAULT_BLOCK_SIZE: usize = 4 * 1024 * 1024; + +enum BatchDoneMsg { + Done { + version: u64, + request_id: proto::BatchRequestId, // as the context index + required_result: Option, + }, + Error { + version: u64, + error_message: String, + request_id: proto::BatchRequestId, + //required_result: Option, + }, + Replaced { + version: u64, + request_id: proto::BatchRequestId, + // required_result: Option, + }, +} + +#[async_trait] +trait BatchDoneResponsor: Send { + async fn done(&self, msg: BatchDoneMsg); +} + +#[derive(Clone)] +struct BatchInProcessResponsor { + /// use option bacause maybe don't need the return in 
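// A minimal sketch of the bounded fan-out used by batch transfer above: split
// the payload into fixed-size blocks, cap in-flight sends with a Semaphore of
// 32 permits, and surface the first failure. `send_block` is a stand-in for
// the rpc_call_batch_data call.
use std::sync::Arc;
use tokio::sync::Semaphore;

const BLOCK: usize = 4 * 1024 * 1024; // mirrors DEFAULT_BLOCK_SIZE

async fn send_all(data: Arc<Vec<u8>>) -> Result<(), String> {
    let total_blocks = (data.len() + BLOCK - 1) / BLOCK; // ceiling division
    let semaphore = Arc::new(Semaphore::new(32));
    let mut handles = Vec::with_capacity(total_blocks);
    for idx in 0..total_blocks {
        let permit = semaphore.clone().acquire_owned().await.map_err(|e| e.to_string())?;
        let data = data.clone();
        handles.push(tokio::spawn(async move {
            let _permit = permit; // hold the permit until this block is sent
            let range = idx * BLOCK..((idx + 1) * BLOCK).min(data.len());
            send_block(idx, &data[range]).await
        }));
    }
    for h in handles {
        h.await.map_err(|e| e.to_string())??;
    }
    Ok(())
}

async fn send_block(_idx: usize, _bytes: &[u8]) -> Result<(), String> {
    Ok(()) // placeholder for the real RPC
}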
delete mode + tx: tokio::sync::mpsc::Sender>, +} + +impl BatchInProcessResponsor { + pub fn new_pair() -> (Self, tokio::sync::mpsc::Receiver>) { + let (tx, rx) = tokio::sync::mpsc::channel(1); + (Self { tx }, rx) + } +} + +#[async_trait] +impl BatchDoneResponsor for BatchInProcessResponsor { + async fn done(&self, msg: BatchDoneMsg) { + match msg { + BatchDoneMsg::Done { + required_result, .. + } => self.tx.send(required_result).await.unwrap(), + BatchDoneMsg::Error { + request_id, + error_message, + .. + } => { + // drop the channel, so the receiver will know the error + panic!("batch one recev {:?} error: {}", request_id, error_message); + } + BatchDoneMsg::Replaced { .. } => {} + } + // self.tx.send(()).await.unwrap(); + } +} + +#[async_trait] +impl BatchDoneResponsor for RPCResponsor { + async fn done(&self, msg: BatchDoneMsg) { + let (request_id, success, error_message, version) = match msg { + BatchDoneMsg::Done { + request_id, + version, + .. + } => (request_id, true, String::new(), version), + BatchDoneMsg::Error { + error_message, + request_id, + version, + .. + } => (request_id, false, error_message, version), + BatchDoneMsg::Replaced { + request_id, + version, + .. + } => (request_id, true, String::new(), version), + }; + let _ = self + .send_resp(BatchDataResponse { + request_id: Some(request_id), + version, + success, + error_message, + }) + .await + .todo_handle("send back batch data response"); + } + + // async fn replaced(&self, request_id: proto::BatchRequestId, version: u64) { + // if let Err(e) = self + // .send_resp(BatchDataResponse { + // request_id: Some(request_id), // 这里需要正确的 request_id + // version, // 这里需要正确的版本号 + // success: true, + // error_message: String::new(), + // }) + // .await + // { + // tracing::error!("Failed to respond to old request: {}", e); + // } + // } +} /// 共享状态,用于记录最新的请求响应器 /// 当收到新的请求时,会更新响应器并自动处理旧的请求 #[derive(Clone)] pub struct SharedWithBatchHandler { /// 当前活跃的响应器 /// 使用 Arc 保证线程安全 - responsor: Arc>>>, + responsor: Arc>>>, //RPCResponsor> } impl SharedWithBatchHandler { @@ -27,28 +149,31 @@ impl SharedWithBatchHandler { /// 更新响应器 /// 如果存在旧的响应器,会自动返回成功 - /// + /// /// # 参数 /// * `responsor` - 新的响应器 - pub async fn update_responsor(&self, responsor: RPCResponsor) { + pub async fn update_responsor( + &self, + request_id: proto::BatchRequestId, + version: u64, + responsor: Box, + ) { let mut guard = self.responsor.lock().await; if let Some(old_responsor) = guard.take() { // 旧的responsor直接返回成功 - if let Err(e) = old_responsor.send_resp(BatchDataResponse { - request_id: None, // 这里需要正确的 request_id - version: 0, // 这里需要正确的版本号 - success: true, - error_message: String::new(), - }).await { - tracing::error!("Failed to respond to old request: {}", e); - } + old_responsor + .done(BatchDoneMsg::Replaced { + version, + request_id, + }) + .await; } *guard = Some(responsor); } /// 获取最终的响应器 /// 用于在所有数据都写入完成后发送最终响应 - pub async fn get_final_responsor(&self) -> Option> { + pub async fn get_final_responsor(&self) -> Option> { self.responsor.lock().await.take() } } @@ -57,21 +182,430 @@ impl SharedWithBatchHandler { /// 用于管理单个批量数据传输请求的生命周期 pub struct BatchReceiveState { /// 写入任务句柄 - pub handle: super::dataitem::WriteSplitDataTaskHandle, + pub handle: RwLock>, /// 共享状态,用于处理请求响应 pub shared: SharedWithBatchHandler, + /// version, same as handle.version() + pub version: u64, } impl BatchReceiveState { /// 创建新的批量数据传输状态 - /// + /// /// # 参数 /// * `handle` - 写入任务句柄 /// * `shared` - 共享状态 - pub fn new(handle: super::dataitem::WriteSplitDataTaskHandle, shared: 
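// A minimal sketch of the dual responsor pattern above: one trait for "this
// batch finished", implemented by an in-process channel for local reads and,
// in the PR, by the RPC responsor for remote writes. Types are simplified
// stand-ins; the async_trait crate is assumed, as in the surrounding code.
use async_trait::async_trait;

enum Done {
    Ok(Vec<u8>),
    Err(String),
}

#[async_trait]
trait DoneResponsor: Send + Sync {
    async fn done(&self, msg: Done);
}

struct ChannelResponsor {
    tx: tokio::sync::mpsc::Sender<Result<Vec<u8>, String>>,
}

#[async_trait]
impl DoneResponsor for ChannelResponsor {
    async fn done(&self, msg: Done) {
        let _ = match msg {
            Done::Ok(v) => self.tx.send(Ok(v)).await,
            Done::Err(e) => self.tx.send(Err(e)).await,
        };
    }
}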
SharedWithBatchHandler) -> Self { + pub fn new( + handle: super::dataitem::WriteSplitDataTaskHandle, + shared: SharedWithBatchHandler, + ) -> Self { Self { - handle, + version: handle.version(), + handle: RwLock::new(Some(handle)), shared, } } } + +#[derive(Clone)] +pub enum GetOrDelType { + Get, + DelReturnData, + DelReturnNoData, +} + +impl GetOrDelType { + pub fn return_data(&self) -> bool { + matches!(self, GetOrDelType::Get | GetOrDelType::DelReturnData) + } + pub fn delete(&self) -> bool { + matches!( + self, + GetOrDelType::DelReturnData | GetOrDelType::DelReturnNoData + ) + } +} +// trait BatchRecvNotifier {} + +impl DataGeneral { + // 处理批量数据写入请求 + pub async fn rpc_handle_batch_data( + &self, + responsor: RPCResponsor, + req: proto::BatchDataRequest, + ) -> WSResult<()> { + tracing::debug!( + "rpc_handle_batch_data with batchid({:?})", + req.request_id.clone().unwrap() + ); + // 预先克隆闭包外需要的字段 + // let block_index = req.block_index; + // let data = req.data.clone(); + // let request_id = req.request_id.clone().unwrap(); + + self.handle_batch_data_one( + req.unique_id, + req.request_id.unwrap(), + req.total_size as usize, + req.block_type.unwrap(), + req.version, + req.block_index as usize, + Box::new(responsor), + req.data_item_idx as u8, + ) + .await?; + + Ok(()) + } + + pub async fn handle_batch_data_one( + &self, + unique_id: Vec, + request_id: proto::BatchRequestId, + total_size: usize, + partial_block: proto::DataItem, + version: u64, + block_index: usize, + responsor: Box, + _item_idx: u8, + ) -> WSResult<()> { + // 1. 查找或创建状态 + let view = self.view.clone(); + let data_type = partial_block.get_data_type(); + let request_id_init = request_id.clone(); + let state = match self + .batch_receive_states + .get_or_init(request_id.clone(), async move { + // 创建任务组和句柄 + let (mut group, handle) = match WriteSplitDataTaskGroup::new( + &view, + unique_id.clone(), + total_size, + data_type, + version, + ) + .await + { + Ok((group, handle)) => (group, handle), + Err(e) => { + tracing::error!("Failed to create task group: {:?}", e); + return Err(e); + } + }; + + // // // 启动process_tasks + // let _ = tokio::spawn(async move { + + // }); + + let state = Arc::new(BatchReceiveState::new( + handle, + SharedWithBatchHandler::new(), + )); + let state_clone = state.clone(); + + // response task + let _ = tokio::spawn(async move { + tracing::debug!("rpc_handle_batch_data response task started"); + let resdata = match group.process_tasks().await { + Ok(item) => item, + Err(e) => { + panic!("Failed to process tasks: {}", e); + } + }; + // 等待所有任务完成 + // let resdata = match waiter.await { + // Ok(data) => { + // tracing::debug!( + // "rpc_handle_batch_data response task wait all tasks done" + // ); + // data + // } + // Err(e) => { + // tracing::error!("Failed to wait for tasks: {}", e); + // todo!("use responsor to send error response"); + // return; + // } + // }; + + tracing::debug!("rpc_handle_batch_data response task wait all tasks done"); + + // 发送最终响应 + if let Some(final_responsor) = state_clone.shared.get_final_responsor().await { + // if let Err(e) = + final_responsor + .done(BatchDoneMsg::Done { + request_id: request_id_init.clone(), + version: state_clone.version, + required_result: Some(resdata), + }) + .await; + // { + // tracing::error!("Failed to send final response: {}", e); + // } + } + + // 清理状态 + let _ = view + .data_general() + .batch_receive_states + .remove(&request_id_init); + }); + + Ok(state) + }) + .await + { + Err(e) => { + return 
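// A minimal sketch of the get_or_init map used above: the first block of a
// BatchRequestId builds the shared receive state, later blocks reuse it.
// Holding the async Mutex across init keeps this correct but serializes
// initializations; the PR's AsyncInitMap is presumably finer-grained.
use std::{collections::HashMap, future::Future, hash::Hash, sync::Arc};
use tokio::sync::Mutex;

struct InitOnceMap<K, V> {
    inner: Mutex<HashMap<K, Arc<V>>>,
}

impl<K: Eq + Hash, V> InitOnceMap<K, V> {
    fn new() -> Self {
        Self { inner: Mutex::new(HashMap::new()) }
    }
    async fn get_or_init<F, Fut>(&self, key: K, init: F) -> Arc<V>
    where
        F: FnOnce() -> Fut,
        Fut: Future<Output = V>,
    {
        let mut map = self.inner.lock().await;
        if let Some(v) = map.get(&key) {
            return v.clone();
        }
        let v = Arc::new(init().await);
        let _ = map.insert(key, v.clone());
        v
    }
}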
Err(WSError::WsDataError(WsDataError::BatchTransferError { + request_id: request_id.clone(), + msg: format!("Failed to initialize batch state: {}", e), + })) + } + Ok(state) => state, + }; + + tracing::debug!("rpc_handle_batch_data ready with write_split_data_task_group"); + + // 2. 提交分片数据 + // let data_item = proto::DataItem { + // data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(data)), + // ..Default::default() + // }; + + let bytes = partial_block.into_data_bytes(); + tracing::debug!( + "submit_split with data split idx: {}, at node: {}, partial {:?}", + block_index, + self.view.p2p().nodes_config.this_node(), + &bytes[0..30] + ); + let keepon = { + let handle_read = state.handle.read().await; + let Some(handle) = handle_read.as_ref() else { + return Err(WSError::WsDataError(WsDataError::DataSplitTaskError { + msg: format!("Failed to submit task: submit_split count to the end"), + })); + }; + handle + .submit_split(block_index as usize * DEFAULT_BLOCK_SIZE, bytes) + .await? + }; + + if !keepon { + // remove state.handle, make the mem count to one + let _ = state.handle.write().await.take(); + } + + // 3. 更新响应器 + state + .shared + .update_responsor(request_id, state.version, responsor) + .await; + + Ok(()) + } + + fn next_batch_id(&self, nodeid: NodeID) -> proto::BatchRequestId { + static NEXT_BATCH_ID: AtomicU64 = AtomicU64::new(1); // 从1开始,保留0作为特殊值 + proto::BatchRequestId { + node_id: nodeid, + sequence: NEXT_BATCH_ID.fetch_add(1, Ordering::Relaxed), + } + } + + pub async fn batch_get_or_del_data( + &self, + unique_id: Vec, + dataset_meta: &DataSetMetaV2, + idxs: &Vec, + opetype: GetOrDelType, + ) -> WSResult>> { + let mut waiters = Vec::new(); + for &idx in idxs { + // check cache first + + // allocate a batch request id + let splits = &dataset_meta.datas_splits[idx as usize]; + let request_id = self.next_batch_id(self.view.p2p().nodes_config.this_node()); + let total_size = splits.total_size(); + let (responsor, waiter) = BatchInProcessResponsor::new_pair(); + waiters.push(waiter); + if let Some(fp_) = &dataset_meta.filepath[idx as usize] { + // try check the target file + let fp = self.view.os().file_path.join(fp_); + if fp.exists() { + responsor + .done(BatchDoneMsg::Done { + version: dataset_meta.version, + request_id: request_id.clone(), + required_result: Some(DataItem::new_file_data(&*fp_, fp.is_dir())), + }) + .await; + tracing::debug!( + "access cached file uid({:?}) idx({}) path({:?}) success", + &unique_id, + idx, + fp + ); + continue; + } else { + tracing::debug!( + "access cached file uid({:?}) idx({}) path({:?}) failed, will get from remote", + &unique_id, + idx, + fp + ); + } + // if fp.is_dir() { + // responsor + // .done(BatchDoneMsg::Done { + // version: dataset_meta.version, + // request_id: request_id, + // required_result: Some(DataItem::new_file_data(&*fp_, true)), + // }) + // .await; + // continue; + // } else { + } else { + // try check the cache + let cache_data = self.cache_in_memory.get(&unique_id); + if let Some(cache_data) = cache_data { + responsor + .done(BatchDoneMsg::Done { + version: dataset_meta.version, + request_id: request_id.clone(), + required_result: Some(DataItem::new_mem_data(cache_data)), + }) + .await; + tracing::debug!( + "access cached mem uid({:?}) idx({}) success", + &unique_id, + idx + ); + continue; + } else { + tracing::debug!( + "access cached mem uid({:?}) idx({}) failed, will get from remote", + &unique_id, + idx + ); + } + } + + tracing::debug!( + "batch_get_or_del_data with receving uid({}) data idx({}) 
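// A minimal sketch of next_batch_id above: a process-wide monotone sequence
// starting at 1 so that 0 stays available as a sentinel value.
use std::sync::atomic::{AtomicU64, Ordering};

fn next_sequence() -> u64 {
    static NEXT: AtomicU64 = AtomicU64::new(1);
    NEXT.fetch_add(1, Ordering::Relaxed)
}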
with length({})", + unique_id.len(), + idx, + total_size + ); + + // 发起多个batch read, 拿到返回结果后并行调用 rpc_handle_batch_data + for (split_idx, split) in splits.splits.iter().enumerate() { + // get one data request + let view = self.view.clone(); + let unique_id = unique_id.clone(); + let request_id = request_id.clone(); + let opetype = opetype.clone(); + let responsor = responsor.clone(); + let version = dataset_meta.version; + let node_id = split.node_id as NodeID; + // let data_offset = split.data_offset; + let _ = tokio::spawn(async move { + // first read the partial block from target node + let mut partial_block = view + .data_general() + .rpc_call_get_data + .call( + view.p2p(), + node_id, + proto::GetOneDataRequest { + unique_id: unique_id.clone(), + idxs: vec![idx as u32], + delete: opetype.delete(), + return_data: opetype.return_data(), + }, + Some(Duration::from_secs(60)), + ) + .await + .unwrap_or_else(|err| { + panic!("batch one recev {:?} error: {}", request_id, err); + }); + + if partial_block.data.len() != 1 { + tracing::warn!( + "batch one recev partial_block wrong count, idx({}), count({})", + idx, + partial_block.data.len() + ); + responsor + .done(BatchDoneMsg::Error { + version: version, + error_message: format!( + "batch one recev partial_block wrong count, idx({}), count({})", + idx, + partial_block.data.len() + ), + request_id: request_id.clone(), + // required_result: None, + }) + .await; + return; + } + + tracing::debug!( + "batch one recev partial_block, idx({}), type({:?}), size({})", + idx, + partial_block.data[0].get_data_type(), + partial_block.data[0].inmem_size() + ); + + if !partial_block.success { + tracing::error!( + "batch one recev {:?} error: {}", + request_id, + partial_block.message + ); + } + + if opetype.return_data() { + view.data_general() + .handle_batch_data_one( + unique_id.clone(), + request_id.clone(), + total_size, + partial_block.data.pop().unwrap(), + version, + split_idx, + Box::new(responsor), + idx, + ) + .await + .unwrap(); + } + }); + } + } + + let res = if opetype.return_data() { + let mut results = Vec::new(); + for (i, waiter) in waiters.iter_mut().enumerate() { + let Some(res) = waiter.recv().await else { + tracing::error!( + "batch one recev error, uid({:?}), idx({})", + unique_id, + idxs[i] + ); + return Err(WSError::WsDataError(WsDataError::DataSplitTaskError { + msg: format!("Failed to submit task: submit_split count to the end"), + })); + }; + results.push(res.unwrap()); + } + Some(results) + } else { + None + }; + Ok(res) + } +} diff --git a/src/main/src/general/data/m_data_general/dataitem.rs b/src/main/src/general/data/m_data_general/dataitem.rs index fd2c083..2887ca4 100644 --- a/src/main/src/general/data/m_data_general/dataitem.rs +++ b/src/main/src/general/data/m_data_general/dataitem.rs @@ -1,31 +1,35 @@ use crate::general::data::m_data_general::UniqueId; -use crate::LogicalModulesRef;//虞光勇修改,修改内容:增加use crate::LogicalModulesRef;来导入 LogicalModulesRef。 -use ::zip::CompressionMethod;//虞光勇修改,因为编译器无法找到 zip 模块中的 CompressionMethod,需加入头文件(860续) +use crate::general::data::m_data_general::{DataItemIdx, DataSplitIdx, GetOrDelDataArgType}; use crate::general::m_os::OperatingSystem; use crate::general::network::proto; -use crate::general::data::m_data_general::{DataItemIdx, DataSplitIdx, GetOrDelDataArgType}; use crate::general::network::proto_ext::{NewPartialFileDataArg, ProtoExtDataItem}; use crate::logical_module_view_impl; -use crate::modules_global_bridge::try_get_modules_ref; use crate::result::{WSError, WSResult, WSResultExt, 
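// A minimal sketch of the gather side of batch_get_or_del_data above: one
// mpsc receiver per requested index; a closed channel (recv() == None)
// signals that the producing task failed.
use tokio::sync::mpsc;

async fn gather(mut waiters: Vec<mpsc::Receiver<Vec<u8>>>) -> Result<Vec<Vec<u8>>, String> {
    let mut results = Vec::with_capacity(waiters.len());
    for (i, w) in waiters.iter_mut().enumerate() {
        match w.recv().await {
            Some(item) => results.push(item),
            None => return Err(format!("batch receive failed for waiter {}", i)),
        }
    }
    Ok(results)
}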
WsDataError}; use crate::util::zip; +use crate::LogicalModulesRef; +use parking_lot::Mutex; +//虞光勇修改,修改内容:增加use crate::LogicalModulesRef;来导入 LogicalModulesRef。 +use ::zip::CompressionMethod; //虞光勇修改,因为编译器无法找到 zip 模块中的 CompressionMethod,需加入头文件(860续) +use base64::{engine::general_purpose::STANDARD, Engine as _}; use futures::stream::{FuturesUnordered, StreamExt}; use std::cell::RefCell; use std::collections::btree_set; +use std::io::Read; use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; use tokio::sync::mpsc; -use tokio::sync::broadcast; +use tokio::sync::{broadcast, oneshot}; use tracing; -use base64::{engine::general_purpose::STANDARD, Engine as _}; -use std::sync::RwLock; const DEFAULT_BLOCK_SIZE: usize = 4096; -logical_module_view_impl!(DataItemView); -logical_module_view_impl!(DataItemView,os,OperatingSystem); +// logical_module_view_impl!(DataItemView); +// logical_module_view_impl!(DataItemView, os, OperatingSystem); /// 用于遍历数据项索引的迭代器 #[derive(Debug)] @@ -35,10 +39,7 @@ pub(super) enum WantIdxIter<'a> { iter: btree_set::Iter<'a, DataItemIdx>, }, /// 遍历单个索引 - PartialOne { - idx: DataItemIdx, - itercnt: u8, - }, + PartialOne { idx: DataItemIdx, itercnt: u8 }, /// 遍历所有或删除操作的索引 Other { ty: GetOrDelDataArgType, @@ -49,7 +50,7 @@ pub(super) enum WantIdxIter<'a> { impl<'a> WantIdxIter<'a> { /// 创建新的索引迭代器 - /// + /// /// # 参数 /// * `ty` - 迭代类型 /// * `itemcnt` - 数据项总数 @@ -62,7 +63,7 @@ impl<'a> WantIdxIter<'a> { itercnt: 0, len: itemcnt, }, - GetOrDelDataArgType::PartialOne { idx } => Self::PartialOne { + GetOrDelDataArgType::PartialOne { idx } => Self::PartialOne { idx: *idx, itercnt: 0, }, @@ -122,6 +123,12 @@ impl SharedMemHolder { // 3. This is safe as long as this is the only reference to the Arc // unsafe { // let ptr = &self.data as *const Arc> as *mut Arc>; + tracing::debug!( + "try_take_data, Arc::strong_count: {}, Arc::weak_count: {}", + Arc::strong_count(&self.data), + Arc::weak_count(&self.data) + ); + if Arc::strong_count(&self.data) == 1 { Some(Arc::try_unwrap(self.data).unwrap()) } else { @@ -136,7 +143,10 @@ impl SharedMemHolder { impl From for Vec { fn from(holder: SharedMemHolder) -> Self { - holder.as_raw_bytes().expect("Failed to get raw bytes").to_vec() + holder + .as_raw_bytes() + .expect("Failed to get raw bytes") + .to_vec() } } @@ -151,7 +161,7 @@ pub struct SharedMemOwnedAccess { impl SharedMemOwnedAccess { /// 获取可变字节切片 - /// + /// /// # Safety /// 调用者必须确保: /// 1. 
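// A minimal sketch of the take-once semantics behind try_take_data above: the
// buffer can be reclaimed without copying only when no other Arc clone is
// alive, which Arc::try_unwrap checks atomically.
use std::sync::Arc;

fn try_take(data: Arc<Vec<u8>>) -> Option<Vec<u8>> {
    Arc::try_unwrap(data).ok()
}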
没有其他线程同时访问这块内存 @@ -168,7 +178,7 @@ impl SharedMemOwnedAccess { } /// 创建新的共享内存和访问者 -/// +/// /// # 参数 /// * `splits` - 内存分片范围列表 #[must_use] @@ -187,10 +197,10 @@ pub fn new_shared_mem(splits: &[Range]) -> (SharedMemHolder, Vec>` - 分片范围列表 #[must_use] @@ -209,7 +219,7 @@ pub fn calculate_splits(total_size: usize) -> Vec> { /// 支持写入文件或内存两种模式 #[derive(Debug, Clone)] pub enum WriteSplitDataType { - Dir{ + Dir { /// 接受的压缩文件形式 zip_file: PathBuf, /// 解压后的文件路径 @@ -223,6 +233,7 @@ pub enum WriteSplitDataType { /// 内存写入模式 Mem { /// 共享内存区域 + /// use option to drop before notify shared_mem: SharedMemHolder, }, } @@ -263,14 +274,13 @@ pub enum WriteSplitDataTaskGroup { /// 任务唯一标识 unique_id: UniqueId, - // /// 共享内存区域 // shared_mem: RefCell>>, /// 费新文修改,修改内容:shared_mem: RefCell>>, /// 修改原因:shared_mem: RefCell>>, 需要修改为 RefCell>, /// 修改后:shared_mem: RefCell>, /// 共享内存区域 - /// + /// // shared_mem: RefCell>, 修改为RwLock>, 曾俊 shared_mem: RwLock>, @@ -290,88 +300,99 @@ pub enum WriteSplitDataTaskGroup { impl WriteSplitDataTaskGroup { /// 创建新的任务组 pub async fn new( + view: &DataGeneralView, unique_id: UniqueId, total_size: usize, block_type: proto::data_item::DataItemDispatch, version: u64, // file_name: Option<&str>, 函数体并没有用到这个参数 查看引用发现也没有使用到这个参数 这里直接删除 曾俊 - ) -> WSResult<(Self, WriteSplitDataTaskHandle)> { + ) -> WSResult<( + Self, + WriteSplitDataTaskHandle, + // oneshot::Receiver>, // send back the recevied data + )> { + // let (alldone_tx, alldone_rx) = oneshot::channel(); let (tx, rx) = mpsc::channel(32); let (broadcast_tx, _) = broadcast::channel::<()>(32); let broadcast_tx = Arc::new(broadcast_tx); // let pathbase=DataItemView::new(try_get_modules_ref().todo_handle("Failed to get modules ref when create WriteSplitDataTaskGroup")?).os().file_path; //所有权发生变化 添加克隆方法 曾俊 - let pathbase=DataItemView::new(try_get_modules_ref().todo_handle("Failed to get modules ref when create WriteSplitDataTaskGroup")?).os().file_path.clone(); + let pathbase = view.os().file_path.clone(); match block_type { proto::data_item::DataItemDispatch::File(file_data) => { - let tmp_file_path = pathbase.join(format!("{}.data", - STANDARD.encode(&unique_id))); - + let tmp_file_path = pathbase + .join(DATA_TMP_DIR) + .join(format!("{}.data", STANDARD.encode(&unique_id))); + let handle = WriteSplitDataTaskHandle { tx, write_type: WriteSplitDataType::File { path: tmp_file_path.clone(), }, version, - broadcast_tx: broadcast_tx.clone(), + totalsize: total_size, + submited_size: Arc::new(AtomicUsize::new(0)), + // takeonce_alldone_tx: Arc::new(Mutex::new(Some(alldone_tx))), }; - + let group = Self::ToFile { is_dir: file_data.is_dir_opt, unique_id, tmp_file_path, - target_file_path: pathbase.join(file_data.file_name_opt.as_str()), + target_file_path: pathbase.join(file_data.file_name_opt.as_str()), tasks: Vec::new(), rx, expected_size: total_size, current_size: 0, broadcast_tx: broadcast_tx.clone(), }; - + Ok((group, handle)) } proto::data_item::DataItemDispatch::RawBytes(_) => { let shared_mem = SharedMemHolder { data: Arc::new(vec![0; total_size]), }; - + let handle = WriteSplitDataTaskHandle { tx, write_type: WriteSplitDataType::Mem { shared_mem: shared_mem.clone(), }, version, - broadcast_tx: broadcast_tx.clone(), + totalsize: total_size, + submited_size: Arc::new(AtomicUsize::new(0)), + // takeonce_alldone_tx: Arc::new(Mutex::new(Some(alldone_tx))), }; - + let group = Self::ToMem { unique_id, // 原代码:shared_mem, 类型不匹配 曾俊 - shared_mem:RwLock::new(Some(shared_mem)), + shared_mem: RwLock::new(Some(shared_mem)), tasks: Vec::new(), rx, 
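// A minimal sketch of calculate_splits above: carve total_size into
// block-aligned half-open ranges; the last range may be short. Assumes
// block > 0 (step_by panics on a zero step).
use std::ops::Range;

fn splits(total_size: usize, block: usize) -> Vec<Range<usize>> {
    (0..total_size)
        .step_by(block)
        .map(|start| start..(start + block).min(total_size))
        .collect()
}
// e.g. splits(10, 4) == [0..4, 4..8, 8..10]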
expected_size: total_size, current_size: 0, broadcast_tx: broadcast_tx.clone(), }; - + Ok((group, handle)) } } } /// 处理所有写入任务 - /// + /// /// # 返回 /// * `Ok(item)` - 所有数据写入完成,返回数据项 /// * `Err(e)` - 写入过程中出错 pub async fn process_tasks(&mut self) -> WSResult { - let mut pending_tasks: FuturesUnordered> = FuturesUnordered::new(); - + let mut pending_tasks: FuturesUnordered> = + FuturesUnordered::new(); + match self { - Self::ToFile { tasks, .. } | - Self::ToMem { tasks, .. } => { + Self::ToFile { tasks, .. } | Self::ToMem { tasks, .. } => { for task in tasks.drain(..) { pending_tasks.push(task); } @@ -380,9 +401,13 @@ impl WriteSplitDataTaskGroup { loop { // 1. 检查完成状态 - match self.try_complete().await.todo_handle("Failed to complete write split data tasks")? { + match self + .try_complete() + .await + .todo_handle("Failed to complete write split data tasks")? + { Some(item) => return Ok(item), - None => {} // 继续等待 + None => {} // 继续等待 } // 2. 等待新任务或已有任务完成 @@ -416,48 +441,106 @@ impl WriteSplitDataTaskGroup { } /// 检查写入完成状态 - /// + /// /// 返回: /// - Ok(Some(item)) - 写入完成,返回数据项 /// - Ok(None) - 写入未完成 /// - Err(e) - 写入出错 async fn try_complete(&self) -> WSResult> { match self { - Self::ToFile { current_size, expected_size, tmp_file_path, target_file_path, unique_id, is_dir, .. } => { + Self::ToFile { + current_size, + expected_size, + tmp_file_path, + target_file_path, + unique_id, + is_dir, + .. + } => { if *current_size > *expected_size { Err(WSError::WsDataError(WsDataError::BatchTransferError { request_id: proto::BatchRequestId { - node_id: 0, // 这里需要传入正确的node_id + node_id: 0, // 这里需要传入正确的node_id sequence: 0, }, - msg: format!("Written size {} exceeds expected size {} for unique_id {:?}", - current_size, expected_size, unique_id) + msg: format!( + "Written size {} exceeds expected size {} for unique_id {:?}", + current_size, expected_size, unique_id + ), })) } else if *current_size == *expected_size { - if *is_dir{ + if *is_dir { // unzip to file_path // - open received file with std api - let file=std::fs::File::open(tmp_file_path).map_err(|e|{ + let file = std::fs::File::open(tmp_file_path).map_err(|e| { tracing::error!("Failed to open file: {}", e); WSError::from(WsDataError::FileOpenErr { path: tmp_file_path.clone(), err: e, }) })?; - let tmp_file_path=tmp_file_path.clone(); - let target_file_path=target_file_path.clone(); - tokio::task::spawn_blocking(move || - zip_extract::extract(file,target_file_path.as_path() , false).map_err(|e|{ - WSError::from(WsDataError::UnzipErr { - path: tmp_file_path, - err: e, - }) - }) - ).await.unwrap().todo_handle("Failed to unzip file")?; - }else{ + // let tmp_file_path = tmp_file_path.clone(); + let target_file_path = target_file_path.clone(); + tokio::task::spawn_blocking(move || { + zip_extract::extract(file, target_file_path.as_path(), false).map_err( + |e| { + // open and read data + // let mut file = std::fs::File::open(tmp_file_path.clone()) + // .map_err(|e| { + // tracing::error!("Failed to open file: {}", e); + // WSError::from(WsDataError::FileOpenErr { + // path: tmp_file_path.clone(), + // err: e, + // }) + // }) + // .unwrap(); + // let mut data = Vec::new(); + // let read_size = file + // .read_to_end(&mut data) + // .map_err(|e| { + // tracing::error!("Failed to read file: {}", e); + // WSError::from(WsDataError::FileReadErr { + // path: tmp_file_path.clone(), + // err: e, + // }) + // }) + // .unwrap(); + // tracing::debug!( + // "zip file data size: {:?} {}", + // data.len(), + // read_size + // ); + // deb + 
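// A minimal sketch of the process_tasks loop above: keep accepting newly
// submitted write tasks from the channel while draining finished ones, until
// the written size reaches the expected total. JoinHandle<usize> stands in
// for the PR's WriteSplitTaskResult.
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::{sync::mpsc, task::JoinHandle};

async fn drive(mut rx: mpsc::Receiver<JoinHandle<usize>>, expected: usize) -> usize {
    let mut pending: FuturesUnordered<JoinHandle<usize>> = FuturesUnordered::new();
    let mut written = 0usize;
    loop {
        if written >= expected {
            return written;
        }
        tokio::select! {
            Some(task) = rx.recv() => pending.push(task),
            Some(done) = pending.next() => written += done.expect("write task panicked"),
            else => return written, // both sources exhausted
        }
    }
}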
WSError::from(WsDataError::UnzipErr { + path: target_file_path, + err: e, + }) + }, + ) + }) + .await + .unwrap() + .todo_handle("Failed to unzip file")?; + // .map_err(|err| { + + // })?; + } else { // rename tmp_file_path to target_file_path - std::fs::rename(tmp_file_path, target_file_path).map_err(|e|{ - tracing::error!("Failed to rename file: {}", e); + let target_dir = target_file_path.parent().unwrap(); + std::fs::create_dir_all(target_dir).map_err(|e| { + tracing::error!("Failed to create target directory: {}", e); + WSError::from(WsDataError::FileCreateErr { + path: target_dir.to_path_buf(), + err: e, + }) + })?; + std::fs::rename(tmp_file_path, target_file_path).map_err(|e| { + tracing::error!( + "Failed to rename file from {:?} to {:?}, error: {}", + tmp_file_path, + target_file_path, + e + ); WSError::from(WsDataError::FileRenameErr { from: tmp_file_path.clone(), to: target_file_path.clone(), @@ -465,32 +548,51 @@ impl WriteSplitDataTaskGroup { }) })?; } - Ok(Some(proto::DataItem{ - data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData{ - file_name_opt: target_file_path.to_string_lossy().to_string(), - is_dir_opt: *is_dir, - file_content: vec![], - })), + Ok(Some(proto::DataItem { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File( + proto::FileData { + file_name_opt: target_file_path.to_string_lossy().to_string(), + is_dir_opt: *is_dir, + file_content: vec![], + }, + )), })) } else { Ok(None) } } - Self::ToMem { current_size, expected_size, shared_mem, unique_id, .. } => { + Self::ToMem { + current_size, + expected_size, + shared_mem, + unique_id, + .. + } => { if *current_size > *expected_size { Err(WSError::WsDataError(WsDataError::BatchTransferError { request_id: proto::BatchRequestId { - node_id: 0, // 这里需要传入正确的node_id + node_id: 0, // 这里需要传入正确的node_id sequence: 0, }, - msg: format!("Written size {} exceeds expected size {} for unique_id {:?}", - current_size, expected_size, unique_id) + msg: format!( + "Written size {} exceeds expected size {} for unique_id {:?}", + current_size, expected_size, unique_id + ), })) } else if *current_size == *expected_size { - Ok(Some(proto::DataItem{ + tracing::debug!("size reached {}, taking mem data", current_size); + Ok(Some(proto::DataItem { //曾俊 随RwLock数据类型改动 // data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(shared_mem.borrow_mut().take().unwrap().try_take_data().expect("only group can take data once"))), - data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(shared_mem.write().expect("Failed to lock RwLock for writing").take().unwrap().try_take_data().expect("only group can take data once"))), + data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes( + shared_mem + .write() + .expect("Failed to lock RwLock for writing") + .take() + .unwrap() + .try_take_data() + .expect("only group can take data once"), + )), })) } else { Ok(None) @@ -512,28 +614,37 @@ impl WriteSplitDataWaiter { while let Ok(_) = self.rx.recv().await { // 不需要处理具体消息内容,只需要知道有消息到达 } - + // 通道关闭表示所有发送端都已释放 Ok(()) } } - /// 写入分片任务的句柄 /// 用于提交新的分片任务和等待任务完成 #[derive(Clone)] pub struct WriteSplitDataTaskHandle { /// 发送任务的通道 tx: mpsc::Sender>, + /// 写入类型(文件或内存) write_type: WriteSplitDataType, + /// 数据版本号 /// 用于防止数据覆盖和保证数据一致性: /// 1. 防止旧版本数据覆盖新版本数据 /// 2. 
客户端可以通过比较版本号确认数据是否最新 version: u64, + /// 广播通道发送端,用于通知任务完成 - broadcast_tx: Arc>, + // broadcast_tx: Arc>, + // takeonce_alldone_tx: Arc>>>>, + + /// total size + totalsize: usize, + + /// submited size + submited_size: Arc, } impl WriteSplitDataTaskHandle { @@ -542,53 +653,92 @@ impl WriteSplitDataTaskHandle { self.version } - pub fn get_all_tasks_waiter(&self) -> WriteSplitDataWaiter { - WriteSplitDataWaiter { - rx: self.broadcast_tx.subscribe(), - } - } - /// 提交新的分片任务 - /// + /// /// # 参数 /// * `idx` - 分片索引,表示数据在整体中的偏移位置 /// * `data` - 分片数据 - /// + /// /// # 返回 - /// * `Ok(())` - 任务提交成功 + /// * `Ok(true)` - 还未接受完全 + /// * `Ok(false)` - 已接受完全 /// * `Err(e)` - 任务提交失败,可能是通道已关闭 - pub async fn submit_split(&self, idx: DataSplitIdx, data: proto::DataItem) -> WSResult<()> { - let task = match &self.write_type { + pub async fn submit_split(&self, idx: DataSplitIdx, data: Vec) -> WSResult { + let (task, submited_size) = match &self.write_type { // WriteSplitDataType::File { path } | WriteSplitDataType::Dir { path } => { 原WriteSplitDataType::Dir忽视了zip_file字段 发现没有用到修改为直接忽视 曾俊 - WriteSplitDataType::File { path } | WriteSplitDataType::Dir { path ,..} => { + WriteSplitDataType::File { path } | WriteSplitDataType::Dir { path, .. } => { let path = path.clone(); let offset = idx; - let data = data.as_raw_bytes().unwrap_or(&[]).to_vec(); let written_size = data.len(); - tokio::spawn(async move { + let submited_size = self + .submited_size + .fetch_add(written_size, std::sync::atomic::Ordering::Relaxed) + + written_size; + tracing::debug!( + "submit_split: after submit len:{}, target len:{}", + submited_size, + self.totalsize, + ); + let task = tokio::spawn(async move { let result = tokio::fs::OpenOptions::new() .create(true) .write(true) .open(&path) .await; - + match result { Ok(mut file) => { + tracing::debug!( + "write_split len:{} offset:{} path:{:?}", + data.len(), + offset, + path, + ); use tokio::io::{AsyncSeekExt, AsyncWriteExt}; if let Err(e) = async move { // 验证seek结果 - let seek_pos = file.seek(std::io::SeekFrom::Start(offset as u64)).await?; + let seek_pos = + file.seek(std::io::SeekFrom::Start(offset as u64)).await?; if seek_pos != offset as u64 { return Err(std::io::Error::new( std::io::ErrorKind::Other, - format!("Seek position mismatch: expected {}, got {}", offset, seek_pos) + format!( + "Seek position mismatch: expected {}, got {}", + offset, seek_pos + ), )); } // write_all保证写入所有数据或返回错误 file.write_all(&data).await?; + // file.sync_data().await?; + file.flush().await?; + // check file size + #[cfg(test)] + { + let metadata = tokio::fs::metadata(&path).await?; + assert!( + metadata.len() as usize >= offset + data.len(), + "file size mismatch, expected {}, got {}", + offset + data.len(), + metadata.len() + ); + + tracing::debug!( + "write file data at offset {} success with size {}", + offset, + metadata.len() + ); + } + Ok::<_, std::io::Error>(()) - }.await { - tracing::error!("Failed to write file data at offset {}: {}", offset, e); + } + .await + { + tracing::error!( + "Failed to write file data at offset {}: {}", + offset, + e + ); panic!("Failed to write file: {}", e); } WriteSplitTaskResult { written_size } @@ -598,80 +748,86 @@ impl WriteSplitDataTaskHandle { panic!("Failed to open file: {}", e); } } - }) + }); + (task, submited_size) } WriteSplitDataType::Mem { shared_mem } => { let mem = shared_mem.clone(); let offset = idx; - let Some(data) = data.as_raw_bytes().map(|data| data.to_vec()) else { - return Err(WSError::WsDataError(WsDataError::BatchTransferFailed { - request_id: 
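// A minimal sketch of the positional block write in submit_split above: seek
// to the block's offset, verify the seek landed where expected, then
// write_all and flush.
use tokio::{
    fs::OpenOptions,
    io::{AsyncSeekExt, AsyncWriteExt},
};

async fn write_block(path: &std::path::Path, offset: u64, data: &[u8]) -> std::io::Result<()> {
    let mut file = OpenOptions::new().create(true).write(true).open(path).await?;
    let pos = file.seek(std::io::SeekFrom::Start(offset)).await?;
    if pos != offset {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("seek position mismatch: expected {}, got {}", offset, pos),
        ));
    }
    file.write_all(data).await?;
    file.flush().await
}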
proto::BatchRequestId { - node_id: 0, - sequence: 0, - }, - reason: format!("mem data expected"), - })); - }; let written_size = data.len(); - tracing::debug!("submit_split: Mem, len:{}, target len:{}", data.len(), shared_mem.len()); - - tokio::spawn(async move { - unsafe { - let slice = std::slice::from_raw_parts_mut( - mem.data.as_ptr() as *mut u8, - mem.data.len() - ); - slice[offset..offset + data.len()].copy_from_slice(&data); - } - WriteSplitTaskResult { written_size } - }) + let submited_size = self + .submited_size + .fetch_add(written_size, std::sync::atomic::Ordering::Relaxed) + + written_size; + + tracing::debug!( + "submit_split: Mem, len:{}, target len:{}", + data.len(), + shared_mem.len() + ); + + ( + tokio::spawn(async move { + unsafe { + let slice = std::slice::from_raw_parts_mut( + mem.data.as_ptr() as *mut u8, + mem.data.len(), + ); + slice[offset..offset + data.len()].copy_from_slice(&data); + } + WriteSplitTaskResult { written_size } + }), + submited_size, + ) } }; // 发送到通道 - let _ = self.broadcast_tx.send(()); - self.tx.send(task).await.map_err(|e| { - tracing::error!("Failed to submit task: channel closed, idx: {:?}, error: {}", idx, e); + // let _ = self.send(()); + let _ = self.tx.send(task).await.map_err(|e| { + tracing::error!( + "Failed to submit task: channel closed, idx: {:?}, error: {}", + idx, + e + ); WSError::WsDataError(WsDataError::DataSplitTaskError { - msg: format!("Failed to submit task: channel closed, error: {}", e) - }) - }) - } - - /// 等待所有已提交的写入任务完成 - /// 关闭发送端,不再接收新任务 - pub async fn wait_all_tasks(&self) -> WSResult<()> { - // 等待广播通知 - let mut rx = self.broadcast_tx.subscribe(); - rx.recv().await.map_err(|e| { - tracing::error!("Failed to wait for tasks: {}", e); - WSError::WsDataError(WsDataError::BatchTransferTaskFailed { - reason: format!("Failed to wait for tasks: {}", e) + msg: format!("Failed to submit task: channel closed, error: {}", e), }) })?; - - Ok(()) + if submited_size == self.totalsize { + Ok(false) + } else if submited_size > self.totalsize { + Err(WSError::WsDataError(WsDataError::DataSplitTaskError { + msg: format!( + "submited_size: {} > totalsize: {}", + submited_size, self.totalsize + ), + })) + } else { + Ok(true) + } } - // 在任务处理逻辑中保持发送端的引用 - pub async fn process_tasks(&mut self) -> WSResult<()> { - let _tx_holder = self.broadcast_tx.clone(); // 保持发送端存活 - - // ...任务处理逻辑... 
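// A minimal sketch of the keepon accounting in submit_split above: fetch_add
// returns the value before the add, so the post-add total must include this
// block. Equal to the total means "last block accepted" (Ok(false)); larger
// indicates a protocol bug.
use std::sync::atomic::{AtomicUsize, Ordering};

fn accept_block(submitted: &AtomicUsize, block_len: usize, total: usize) -> Result<bool, String> {
    let after = submitted.fetch_add(block_len, Ordering::Relaxed) + block_len;
    if after < total {
        Ok(true) // keep receiving
    } else if after == total {
        Ok(false) // complete; the caller can drop its write handle
    } else {
        Err(format!("submitted {} exceeds expected {}", after, total))
    }
}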
- - // 当所有任务完成,_tx_holder被释放,广播通道自动关闭 - Ok(()) -} + // / 等待所有已提交的写入任务完成 + // / 关闭发送端,不再接收新任务 + // pub async fn wait_all_tasks(&self) -> WSResult<()> { + // // 等待广播通知 + // let mut rx = self.broadcast_tx.subscribe(); + // rx.recv().await.map_err(|e| { + // tracing::error!("Failed to wait for tasks: {}", e); + // WSError::WsDataError(WsDataError::BatchTransferTaskFailed { + // reason: format!("Failed to wait for tasks: {}", e), + // }) + // })?; + + // Ok(()) + // } } #[derive(Debug)] pub enum DataItemSource { - Memory { - data: Vec, - }, - File { - path: PathBuf, - }, + Memory { data: Vec }, + File { path: PathBuf }, } impl DataItemSource { @@ -693,9 +849,7 @@ impl DataItemSource { Some(proto::data_item::DataItemDispatch::File(file_data)) => Self::File { path: file_data.file_name_opt.clone().into(), }, - _ => Self::Memory { - data: Vec::new(), - }, + _ => Self::Memory { data: Vec::new() }, } } @@ -703,13 +857,19 @@ impl DataItemSource { pub fn to_data_item(&self) -> proto::DataItem { match self { DataItemSource::Memory { data } => proto::DataItem { - data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(data.clone())), + data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes( + data.clone(), + )), }, DataItemSource::File { path } => proto::DataItem { - data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData { - file_name_opt: path.to_str().map_or_else(|| String::from(""), |s| s.to_string()), // 这里需要根据实际情况调整类型转换 - ..Default::default() // 假设 FileData 有其他字段,这里使用默认值 - })), + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File( + proto::FileData { + file_name_opt: path + .to_str() + .map_or_else(|| String::from(""), |s| s.to_string()), // 这里需要根据实际情况调整类型转换 + ..Default::default() // 假设 FileData 有其他字段,这里使用默认值 + }, + )), }, } } @@ -718,20 +878,20 @@ impl DataItemSource { match self { DataItemSource::Memory { data } => Ok(data.len()), DataItemSource::File { path } => { - let metadata = tokio::fs::metadata(path).await.map_err(|e| + let metadata = tokio::fs::metadata(path).await.map_err(|e| { WSError::WsDataError(WsDataError::BatchTransferFailed { request_id: proto::BatchRequestId { - node_id: 0, // 这里需要传入正确的node_id + node_id: 0, // 这里需要传入正确的node_id sequence: 0, }, reason: format!("Failed to get file size: {}", e), }) - )?; + })?; Ok(metadata.len() as usize) } } } - + // pub fn block_type(&self) -> proto::BatchDataBlockType { // match self { // DataItemSource::Memory { .. 
} => proto::BatchDataBlockType::Memory, @@ -750,12 +910,10 @@ impl DataItemSource { actual: 0, })) } - }, + } DataItemSource::File { path } => { let content = tokio::fs::read(path).await.map_err(|_e| { - WSError::WsDataError(WsDataError::ReadDataFailed { - path: path.clone(), - }) + WSError::WsDataError(WsDataError::ReadDataFailed { path: path.clone() }) })?; if block_idx == 0 { Ok(content) @@ -765,14 +923,19 @@ impl DataItemSource { actual: 0, })) } - }, + } } } } use crate::general::network::proto_ext::DataItemExt; +use super::{DataGeneralView, DATA_TMP_DIR}; + impl DataItemExt for DataItemSource { + fn inmem_size(&self) -> usize { + todo!() + } fn decode_persist(data: Vec) -> WSResult { if data.is_empty() { return Err(WSError::WsDataError(WsDataError::DataDecodeError { @@ -791,14 +954,14 @@ impl DataItemExt for DataItemSource { Ok(DataItemSource::File { path: PathBuf::from(path_str), }) - }, + } 1 => Ok(DataItemSource::Memory { data: data[1..].to_owned(), }), _ => Err(WSError::WsDataError(WsDataError::DataDecodeError { reason: format!("Unknown data item type id: {}", data[0]), data_type: "DataItemSource".to_string(), - })) + })), } } @@ -816,6 +979,14 @@ impl DataItemExt for DataItemSource { } } } + + fn get_data_type(&self) -> proto::data_item::DataItemDispatch { + todo!() + } + + fn into_data_bytes(self) -> Vec { + todo!() + } } #[derive(Debug, Clone)] @@ -825,9 +996,7 @@ enum DataItemZip { /// 不需要压缩(非目录) NoNeed, /// 已压缩的目录 - Directory { - zipped_file: PathBuf, - } + Directory { zipped_file: PathBuf }, } //派生显示特征 曾俊 @@ -839,49 +1008,67 @@ pub struct DataItemArgWrapper { } impl DataItemArgWrapper { - - // 根据传入的DataItem类型新建一个DataItemArgWrapper实例, tmpzipfile默认为Uninitialized。 曾俊 + pub fn filepath(&self) -> Option { + match &self.dataitem.data_item_dispatch { + Some(proto::data_item::DataItemDispatch::File(file_data)) => { + Some(file_data.file_name_opt.clone()) + } + _ => None, + } + } + // 根据传入的DataItem类型新建一个DataItemArgWrapper实例, tmpzipfile默认为Uninitialized。 曾俊 pub fn new(value: Vec) -> Self { DataItemArgWrapper { - dataitem:proto::DataItem {data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(value))}, + dataitem: proto::DataItem { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(value)), + }, tmpzipfile: DataItemZip::Uninitialized, } } - pub fn from_file(filepath: PathBuf) -> WSResult { - let view=DataItemView::new(try_get_modules_ref().map_err(|err|{ - tracing::error!("Failed to get modules ref: {}", err); - err - })?); + pub fn from_file(mref: LogicalModulesRef, filepath: PathBuf) -> WSResult { + let view = DataGeneralView::new(mref); //let abs_filepath=view.os().abs_file_path(filepath); //虞光勇修改 添加.clone() - let abs_filepath=view.os().abs_file_path(filepath.clone()); - - Ok(Self { - dataitem: proto::DataItem{ - data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData{ - is_dir_opt: abs_filepath.is_dir(), - file_name_opt: filepath.to_str().unwrap().to_string(), - file_content: vec![], - })), + let abs_filepath = view.os().abs_file_path(filepath.clone()); + + Ok(Self { + dataitem: proto::DataItem { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File( + proto::FileData { + is_dir_opt: abs_filepath.is_dir(), + file_name_opt: filepath.to_str().unwrap().to_string(), + file_content: vec![], + }, + )), }, tmpzipfile: DataItemZip::Uninitialized, }) } pub fn from_bytes(bytes: Vec) -> Self { - Self { - dataitem: proto::DataItem{ + Self { + dataitem: proto::DataItem { data_item_dispatch: 
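// A minimal sketch of the one-byte-tag persistence format handled by
// decode_persist above: tag 0 is a UTF-8 file path, tag 1 is raw in-memory
// bytes; anything else is rejected. Function names are illustrative.
fn encode_persist(path: Option<&str>, mem: &[u8]) -> Vec<u8> {
    match path {
        Some(p) => {
            let mut out = vec![0u8];
            out.extend_from_slice(p.as_bytes());
            out
        }
        None => {
            let mut out = vec![1u8];
            out.extend_from_slice(mem);
            out
        }
    }
}

fn decode_tag(data: &[u8]) -> Result<(u8, &[u8]), String> {
    match data.split_first() {
        None => Err("empty payload".into()),
        Some((&tag, rest)) if tag <= 1 => Ok((tag, rest)),
        Some((&tag, _)) => Err(format!("unknown data item type id: {}", tag)),
    }
}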
Some(proto::data_item::DataItemDispatch::RawBytes(bytes)), }, tmpzipfile: DataItemZip::Uninitialized, } } - pub async fn get_tmpzipfile(&mut self) -> WSResult> { + pub async fn get_tmpzipfile(&mut self, filepath: &PathBuf) -> WSResult> { match &self.tmpzipfile { DataItemZip::Uninitialized => { - self.init_tmpzipfile().await?; + self.init_tmpzipfile(filepath).await.map_err(|err| { + tracing::warn!( + "Failed to init tmpzipfile {} with error {}", + self.dataitem + .as_file_data() + .expect("only filedata need to be initialized") + .file_name_opt, + err + ); + err + })?; } _ => {} } @@ -893,7 +1080,8 @@ impl DataItemArgWrapper { } } - async fn init_tmpzipfile(&mut self) -> WSResult<()> { + // call this only when this dataitem is a directory + async fn init_tmpzipfile(&mut self, filepath: &PathBuf) -> WSResult<()> { // 确保只初始化一次 if !matches!(self.tmpzipfile, DataItemZip::Uninitialized) { return Ok(()); @@ -904,33 +1092,48 @@ impl DataItemArgWrapper { proto::data_item::DataItemDispatch::RawBytes(_) => { self.tmpzipfile = DataItemZip::NoNeed; return Ok(()); - }, + } }; + let filepath = filepath.join(&filedata.file_name_opt); // 检查目录元数据 - let metadata = tokio::fs::metadata(&filedata.file_name_opt).await.map_err(|e| { + let metadata = tokio::fs::metadata(&filepath).await.map_err(|e| { WSError::WsDataError(WsDataError::FileMetadataErr { - path: PathBuf::from(&filedata.file_name_opt), + path: filepath.clone(), err: e, }) })?; if metadata.is_dir() { let tmp_file = tempfile::NamedTempFile::new().map_err(|e| { - WSError::WsDataError(WsDataError::FileMetadataErr { - path: PathBuf::from(&filedata.file_name_opt), + WSError::WsDataError(WsDataError::TransferDirCreateTmpFileFailed { + path: filepath.clone(), err: e, + context: "init_tmpzipfile".to_string(), + }) + })?; + // let tmp_path = tmp_file.path().to_path_buf(); + let (mut tmp_file, tmp_path) = tmp_file.keep().map_err(|err| { + tracing::error!("Failed to keep tmp_file: {}", err); + WSError::WsDataError(WsDataError::TransferDirPersistTmpFileKeepFailed { + path: filepath.clone(), + err: err, + context: "init_tmpzipfile".to_string(), }) })?; - let tmp_path = tmp_file.path().to_path_buf(); - + // 压缩目录到临时文件 crate::util::zip::zip_dir_2_file( - &filedata.file_name_opt, + &filepath, //zip::CompressionMethod::Stored, - CompressionMethod::Stored,//(续)虞光勇修改,修改内容删除zip:: - tmp_file.into_file(), - ).await?; + CompressionMethod::Stored, //(续)虞光勇修改,修改内容删除zip:: + &mut tmp_file, + ) + .await?; + + // debug zip file size + let metadata = tokio::fs::metadata(&tmp_path).await?; + tracing::info!("zip file size: {}", metadata.len()); self.tmpzipfile = DataItemZip::Directory { zipped_file: tmp_path, @@ -942,7 +1145,9 @@ impl DataItemArgWrapper { Ok(()) } - pub async fn transfer_size(&mut self) -> WSResult { + /// get file sized + /// files supposed to be all compressed if return no error + pub async fn get_data_size(&mut self, filepath: &PathBuf) -> WSResult { match &self.dataitem.data_item_dispatch { Some(proto::data_item::DataItemDispatch::RawBytes(bytes)) => return Ok(bytes.len()), Some(proto::data_item::DataItemDispatch::File(_)) => { @@ -951,69 +1156,93 @@ impl DataItemArgWrapper { None => return Ok(0), } - if let Some(tmp_path) = self.get_tmpzipfile().await? { + if let Some(tmp_path) = self.get_tmpzipfile(filepath).await? 
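// A minimal sketch of the temp-zip step in init_tmpzipfile above: persist the
// NamedTempFile so the archive outlives the guard, then compress the
// directory into it. `zip_dir` is a stand-in for the project's
// util::zip::zip_dir_2_file.
use std::{
    fs::File,
    io,
    path::{Path, PathBuf},
};

fn make_tmp_zip(dir: &Path) -> io::Result<PathBuf> {
    let tmp = tempfile::NamedTempFile::new()?;
    let (mut file, path) = tmp.keep().map_err(|e| e.error)?; // PersistError carries the io::Error
    zip_dir(dir, &mut file)?;
    Ok(path)
}

fn zip_dir(_dir: &Path, _out: &mut File) -> io::Result<()> {
    Ok(()) // placeholder for the real compression
}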
{ + // tracing::info!("tmp_path: {:?}, please debug file in 30 seconds", tmp_path); + // tokio::time::sleep(Duration::from_secs(30)).await; let metadata = tokio::fs::metadata(tmp_path).await?; Ok(metadata.len() as usize) } else { - let file_data=match &self.dataitem.data_item_dispatch { + let file_data = match &self.dataitem.data_item_dispatch { Some(proto::data_item::DataItemDispatch::File(file_data)) => { // handle in following file_data } - Some(proto::data_item::DataItemDispatch::RawBytes(_)) | None=>{panic!("these case should be handled in previous match")} + Some(proto::data_item::DataItemDispatch::RawBytes(_)) | None => { + panic!("these case should be handled in previous match") + } }; let metadata = tokio::fs::metadata(&file_data.file_name_opt).await?; Ok(metadata.len() as usize) } } - pub async fn clone_split_range(&mut self, range: Range) -> WSResult { + pub async fn clone_split_range( + &mut self, + filepath: &PathBuf, + range: Range, + ) -> WSResult { match &self.dataitem.data_item_dispatch { Some(proto::data_item::DataItemDispatch::RawBytes(bytes)) => { - return Ok(proto::DataItem::new_partial_raw_bytes(bytes.to_owned(), range).map_err(|err|{ - tracing::error!("Failed to clone split range: {}", err); - err - })?) + return Ok( + proto::DataItem::new_partial_raw_bytes(bytes.to_owned(), range).map_err( + |err| { + tracing::error!("Failed to clone split range: {}", err); + err + }, + )?, + ) } Some(proto::data_item::DataItemDispatch::File(_)) => { - + // handle in following } None => panic!("proto dataitem must be Some"), } - - fn get_filedata(dataitem:&DataItemArgWrapper)->&proto::FileData{ + + fn get_filedata(dataitem: &DataItemArgWrapper) -> &proto::FileData { match &dataitem.dataitem.data_item_dispatch { Some(proto::data_item::DataItemDispatch::File(file_data)) => file_data, - Some(proto::data_item::DataItemDispatch::RawBytes(_)) | None=>{panic!("these case should be handled in previous match")} + Some(proto::data_item::DataItemDispatch::RawBytes(_)) | None => { + panic!("these case should be handled in previous match") + } } } // if zipped, use zipped file // else use file_data.file_name_opt - if let Some(tmp_path) = self.get_tmpzipfile().await?.cloned() { - let file_data=get_filedata(self); - Ok(proto::DataItem::new_partial_file_data(NewPartialFileDataArg::FilePath { path: PathBuf::from_str(&file_data.file_name_opt).map_err(|err|{ - let err=WsDataError::FilePathParseErr { - path: file_data.file_name_opt.clone(), - err: err, - }; - tracing::error!("Failed to clone split range: {:?}", err); - err - })? , zip_path: Some(tmp_path.clone()) }, range).await.map_err(|err|{ + if let Some(tmp_path) = self.get_tmpzipfile(filepath).await?.cloned() { + let file_data = get_filedata(self); + Ok(proto::DataItem::new_partial_file_data( + NewPartialFileDataArg::FilePath { + basepath: filepath.clone(), + path: PathBuf::from(file_data.file_name_opt.clone()), + zip_path: Some(tmp_path.clone()), + }, + range, + ) + .await + .map_err(|err| { tracing::error!("Failed to clone split range: {}", err); err })?) } else { - let file_data=get_filedata(self); - Ok(proto::DataItem::new_partial_file_data(NewPartialFileDataArg::FilePath { path: PathBuf::from_str(&file_data.file_name_opt).map_err(|err|{ - let err=WsDataError::FilePathParseErr { - path: file_data.file_name_opt.clone(), - err: err, - }; - tracing::error!("Failed to clone split range: {:?}", err); - err - })? 
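// A minimal sketch of the RawBytes arm of clone_split_range above: hand out a
// bounds-checked copy of one split of an in-memory item.
use std::ops::Range;

fn clone_range(bytes: &[u8], range: Range<usize>) -> Result<Vec<u8>, String> {
    bytes
        .get(range.clone())
        .map(<[u8]>::to_vec)
        .ok_or_else(|| format!("range {:?} out of bounds for len {}", range, bytes.len()))
}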
, zip_path: None }, range).await.map_err(|err|{ + let file_data = get_filedata(self); + // let path = ; + // if !path.exists() { + // return Err(WSError::WsDataError(WsDataError::FileNotFound { + // path: path.clone(), + // })); + // } + Ok(proto::DataItem::new_partial_file_data( + NewPartialFileDataArg::FilePath { + basepath: filepath.clone(), + path: PathBuf::from(file_data.file_name_opt.clone()), + zip_path: None, + }, + range, + ) + .await + .map_err(|err| { tracing::error!("Failed to clone split range: {}", err); err })?) diff --git a/src/main/src/general/data/m_data_general/mod.rs b/src/main/src/general/data/m_data_general/mod.rs index 250da75..2fe2c02 100644 --- a/src/main/src/general/data/m_data_general/mod.rs +++ b/src/main/src/general/data/m_data_general/mod.rs @@ -1,15 +1,21 @@ /// 缓存模式类型 pub type CacheMode = u16; -pub mod dataitem; pub mod batch; pub mod batch_handler; +pub mod dataitem; -use crate::general::data::m_data_general::dataitem::{calculate_splits, WantIdxIter, WriteSplitDataTaskGroup, DataItemSource}; -use crate::general::data::m_data_general::batch_handler::{BatchReceiveState, SharedWithBatchHandler}; -use crate::general::network::proto::DataItem; +use crate::general::data::m_data_general::batch_handler::{ + BatchReceiveState, SharedWithBatchHandler, +}; +use crate::general::data::m_data_general::dataitem::{ + calculate_splits, DataItemSource, WantIdxIter, WriteSplitDataTaskGroup, +}; +use crate::general::network::proto::{DataItem, FnTaskId}; +use batch_handler::GetOrDelType; use dataitem::{DataItemArgWrapper, WriteSplitTaskResult}; -use tokio::io::{AsyncSeekExt, AsyncReadExt}; +use tokio::fs; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; use crate::general::{ data::m_kv_store_engine::{ @@ -18,21 +24,19 @@ use crate::general::{ m_os::OperatingSystem, network::{ m_p2p::{P2PModule, RPCCaller, RPCHandler, RPCResponsor}, - proto::{ - self, DataMeta, WriteOneDataResponse, - }, + proto::{self, DataMeta, WriteOneDataResponse}, proto_ext::ProtoExtDataItem, }, }; use crate::{ general::{ data::m_kv_store_engine::{KeyLockGuard, KeyType}, - network::{proto_ext::DataItemExt}, + network::proto_ext::DataItemExt, }, logical_module_view_impl, - result::{WSError, WSResult, WSResultExt, WsSerialErr, WsNetworkLogicErr}, + result::{WSError, WSResult, WSResultExt, WsNetworkLogicErr, WsSerialErr}, sys::{LogicalModule, LogicalModuleNewArgs, NodeID}, - util::{JoinHandleWrapper, container::async_init_map::AsyncInitMap}, + util::{container::async_init_map::AsyncInitMap, JoinHandleWrapper}, }; use crate::{result::WsDataError, sys::LogicalModulesRef}; use async_trait::async_trait; @@ -42,15 +46,14 @@ use core::str; use serde::{Deserialize, Serialize}; use std::{ collections::{BTreeSet, HashMap, HashSet}, + sync::atomic::{AtomicU32, Ordering}, sync::Arc, time::Duration, - sync::atomic::{AtomicU32, Ordering}, }; use tokio::sync::Semaphore; use tokio::task::JoinError; use ws_derive::LogicalModule; - // 费新文 // use crate::general::network::proto::sche::{DistributeTaskReq, DistributeTaskResp}; // use crate::general::app::app_native::NativeAppInstance; @@ -61,10 +64,6 @@ use ws_derive::LogicalModule; // FnExeCtxAsyncAllowedType, // }; - - - - logical_module_view_impl!(DataGeneralView); logical_module_view_impl!(DataGeneralView, p2p, P2PModule); logical_module_view_impl!(DataGeneralView, data_general, DataGeneral); @@ -77,9 +76,6 @@ pub type DataItemIdx = u8; pub const DATA_UID_PREFIX_APP_META: &str = "app"; pub const DATA_UID_PREFIX_FN_KV: &str = "fkv"; -/// 默认数据块大小 (4MB) -pub const 
DEFAULT_BLOCK_SIZE: usize = 4 * 1024 * 1024; - pub const CACHE_MODE_TIME_MASK: u16 = 0xf000; pub const CACHE_MODE_TIME_FOREVER_MASK: u16 = 0x0fff; pub const CACHE_MODE_TIME_AUTO_MASK: u16 = 0x1fff; @@ -94,6 +90,18 @@ pub const CACHE_MODE_MAP_COMMON_KV_MASK: u16 = 0xff0f; pub const CACHE_MODE_MAP_FILE_MASK: u16 = 0xff1f; // const DATA_UID_PREFIX_OBJ: &str = "obj"; +pub fn parse_appname_from_data_uid(data_uid: &[u8]) -> Option<String> { + if data_uid.starts_with(DATA_UID_PREFIX_APP_META.as_bytes()) { + Some( + str::from_utf8(&data_uid[DATA_UID_PREFIX_APP_META.len()..]) + .unwrap_or_default() + .to_string(), + ) + } else { + None + } +} + pub fn new_data_unique_id_app(app_name: &str) -> String { format!("{}{}", DATA_UID_PREFIX_APP_META, app_name) } @@ -120,8 +128,6 @@ pub struct DataGeneral { //费新文 // rpc_call_distribute_task: RPCCaller, - - rpc_handler_write_once_data: RPCHandler, rpc_handler_batch_data: RPCHandler, rpc_handler_data_meta_update: RPCHandler, @@ -131,44 +137,14 @@ pub struct DataGeneral { //费新文 // rpc_handler_distribute_task: RPCHandler, - // batch data receive state management batch_receive_states: AsyncInitMap<proto::BatchRequestId, Arc<BatchReceiveState>>, + + // cache in memory + cache_in_memory: moka::sync::Cache<String, Vec<u8>>, } impl DataGeneral { - pub fn inner_new(args: LogicalModuleNewArgs) -> Self { - Self { - view: DataGeneralView::new(args.logical_modules_ref.clone()), - rpc_call_data_version_schedule: RPCCaller::new(), - rpc_call_write_once_data: RPCCaller::new(), - rpc_call_batch_data: RPCCaller::new(), - rpc_call_get_data_meta: RPCCaller::new(), - rpc_call_get_data: RPCCaller::new(), - - //费新文 - // rpc_call_distribute_task: RPCCaller::new(), - - - rpc_handler_write_once_data: RPCHandler::new(), - rpc_handler_batch_data: RPCHandler::new(), - rpc_handler_data_meta_update: RPCHandler::new(), - rpc_handler_get_data_meta: RPCHandler::new(), - rpc_handler_get_data: RPCHandler::new(), - batch_receive_states: AsyncInitMap::new(), - - //费新文 - // rpc_handler_distribute_task: RPCHandler::new(), - - } - } - - #[allow(dead_code)] - fn next_batch_id(&self) -> u32 { - static NEXT_BATCH_ID: AtomicU32 = AtomicU32::new(1); // starts at 1; 0 is reserved as a special value - NEXT_BATCH_ID.fetch_add(1, Ordering::Relaxed) - } - pub async fn write_data_batch( &self, unique_id: UniqueId, @@ -188,49 +164,65 @@ impl DataGeneral { ) -> WSResult<()> { let (tx, mut rx) = tokio::sync::mpsc::channel(32); let mut handles = Vec::new(); - + let data_size = data.size().await?; let splits = calculate_splits(data_size); - - tracing::debug!("batch_transfer total size({}), splits: {:?}, to node {}", data_size, splits, target_node); + + tracing::debug!( + "batch_transfer total size({}), splits: {:?}, to node {}", + data_size, + splits, + target_node + );
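calculate_splits (imported from the dataitem module) drives the loop below by cutting the transfer into fixed-size blocks; its arithmetic is not shown in this patch, but under the 4 MB DEFAULT_BLOCK_SIZE constant removed above it reduces to something like this sketch (split_ranges is a hypothetical stand-in, not the real function):

use std::ops::Range;

// Partition `total` bytes into contiguous 4 MB ranges; the last one is short.
fn split_ranges(total: usize) -> Vec<Range<usize>> {
    const BLOCK: usize = 4 * 1024 * 1024;
    (0..total)
        .step_by(BLOCK)
        .map(|start| start..usize::min(start + BLOCK, total))
        .collect()
}
// split_ranges(10 * 1024 * 1024) yields 0..4M, 4M..8M, 8M..10M.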
for (block_idx, split_range) in splits.iter().enumerate() { let block_data = match data.as_ref() { DataItemSource::Memory { data } => data[split_range.clone()].to_vec(), DataItemSource::File { path } => { // read this block's byte range from the file - let mut file = tokio::fs::File::open(path).await.map_err(|e| WsDataError::BatchTransferFailed { - request_id: proto::BatchRequestId { - node_id: target_node as u32, - sequence: block_idx as u64, - }, - reason: format!("Failed to open file: {}", e), + let mut file = tokio::fs::File::open(path).await.map_err(|e| { + WsDataError::BatchTransferFailed { + request_id: proto::BatchRequestId { + node_id: target_node as u32, + sequence: block_idx as u64, + }, + reason: format!("Failed to open file: {}", e), + } })?; let mut buffer = vec![0; split_range.len()]; // verify the seek result - let seek_pos = file.seek(std::io::SeekFrom::Start(split_range.start as u64)).await.map_err(|e| WsDataError::BatchTransferFailed { - request_id: proto::BatchRequestId { - node_id: target_node as u32, - sequence: block_idx as u64, - }, - reason: format!("Failed to seek file: {}", e), - })?; + let seek_pos = file + .seek(std::io::SeekFrom::Start(split_range.start as u64)) + .await + .map_err(|e| WsDataError::BatchTransferFailed { + request_id: proto::BatchRequestId { + node_id: target_node as u32, + sequence: block_idx as u64, + }, + reason: format!("Failed to seek file: {}", e), + })?; if seek_pos != split_range.start as u64 { return Err(WsDataError::BatchTransferFailed { request_id: proto::BatchRequestId { node_id: target_node as u32, sequence: block_idx as u64, }, - reason: format!("Seek position mismatch: expected {}, got {}", split_range.start, seek_pos), - }.into()); + reason: format!( + "Seek position mismatch: expected {}, got {}", + split_range.start, seek_pos + ), + } + .into()); } // read_exact guarantees the requested length is read, or returns an error - let _ = file.read_exact(&mut buffer).await.map_err(|e| WsDataError::BatchTransferFailed { - request_id: proto::BatchRequestId { - node_id: target_node as u32, - sequence: block_idx as u64, - }, - reason: format!("Failed to read file: {}", e), + let _ = file.read_exact(&mut buffer).await.map_err(|e| { + WsDataError::BatchTransferFailed { + request_id: proto::BatchRequestId { + node_id: target_node as u32, + sequence: block_idx as u64, + }, + reason: format!("Failed to read file: {}", e), + } })?; buffer }
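The File arm above is the usual seek-then-read_exact recipe plus an explicit position check; stripped of the batch error wrapping, it comes down to this (a sketch over plain tokio I/O):

use tokio::io::{AsyncReadExt, AsyncSeekExt};

// Read exactly `range` bytes from `path`, failing if the seek lands elsewhere
// or the file ends early (read_exact never short-reads silently).
async fn read_block(
    path: &std::path::Path,
    range: std::ops::Range<usize>,
) -> std::io::Result<Vec<u8>> {
    let mut file = tokio::fs::File::open(path).await?;
    let pos = file.seek(std::io::SeekFrom::Start(range.start as u64)).await?;
    assert_eq!(pos, range.start as u64, "seek position mismatch");
    let mut buf = vec![0u8; range.len()];
    file.read_exact(&mut buf).await?;
    Ok(buf)
}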
@@ -245,11 +237,19 @@ impl DataGeneral { data_item_idx: data_item_idx as u32, // use an empty DataItem as a placeholder block_type: match data.as_ref() { - DataItemSource::Memory { .. } => Some(proto::DataItem{ - data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(Vec::new())), + DataItemSource::Memory { .. } => Some(proto::DataItem { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes( + Vec::new(), + )), }), - DataItemSource::File { .. } => Some(proto::DataItem{ - data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData { file_name_opt: String::new(), is_dir_opt: true, file_content: Vec::new() })), + DataItemSource::File { .. } => Some(proto::DataItem { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File( + proto::FileData { + file_name_opt: String::new(), + is_dir_opt: true, + file_content: Vec::new(), + }, + )), }), }, block_index: block_idx as u32, @@ -259,12 +259,13 @@ impl DataGeneral { version, total_size: data_size as u64, }; - + let tx = tx.clone(); let view = view.clone(); - + let handle = tokio::spawn(async move { - let result = view.data_general() + let result = view + .data_general() .rpc_call_batch_data .call( view.p2p(), @@ -273,17 +274,17 @@ impl DataGeneral { Some(Duration::from_secs(30)), ) .await; - + if let Err(e) = tx.send(result).await { tracing::error!("Failed to send batch transfer result: {}", e); } }); - + handles.push(handle); } - + drop(tx); - + while let Some(result) = rx.recv().await { match result { Ok(resp) if !resp.success => { @@ -293,7 +294,8 @@ impl DataGeneral { sequence: 0, // TODO: Add proper sequence number }, reason: resp.error_message, - }.into()); + } + .into()); } Ok(_) => continue, Err(e) => { @@ -303,20 +305,19 @@ impl DataGeneral { sequence: 0, }, reason: format!("RPC call failed: {}", e), - }.into()); + } + .into()); } } } for handle in handles { - handle.await.map_err(|e| { - WsDataError::BatchTransferFailed { - request_id: proto::BatchRequestId { - node_id: target_node as u32, - sequence: 0, - }, - reason: format!("Task join failed: {}", e), - } + handle.await.map_err(|e| WsDataError::BatchTransferFailed { + request_id: proto::BatchRequestId { + node_id: target_node as u32, + sequence: 0, + }, + reason: format!("Task join failed: {}", e), })?; } @@ -324,16 +325,28 @@ impl DataGeneral { } let data = Arc::new(data.to_data_item_source()); - batch_transfer(data_item_idx,unique_id, version, node_id, data, self.view.clone()).await + batch_transfer( + data_item_idx, + unique_id, + version, + node_id, + data, + self.view.clone(), + ) + .await } - pub async fn get_or_del_datameta_from_master( &self, unique_id: &[u8], delete: bool, ) -> WSResult<DataSetMetaV2> { - tracing::debug!("get_or_del_datameta_from_master uid: {:?}, delete: {}, whoami: {}", unique_id, delete, self.view.p2p().nodes_config.this.0); + tracing::debug!( + "get_or_del_datameta_from_master uid: {:?}, delete: {}, whoami: {}", + unique_id, + delete, + self.view.p2p().nodes_config.this.0 + ); let p2p = self.view.p2p(); // get meta from master let meta = self @@ -365,7 +378,7 @@ impl DataGeneral { }) } - pub async fn get_or_del_data( + pub async fn get_or_del_datas( &self, GetOrDelDataArg { meta, @@ -373,8 +386,12 @@ impl DataGeneral { ty, }: GetOrDelDataArg, ) -> WSResult<(DataSetMetaV2, HashMap<DataItemIdx, proto::DataItem>)> { - tracing::debug!("get_or_del_data uid: {:?}, maybe with meta: {:?}", unique_id, meta); - let mut data_map = HashMap::new(); + tracing::debug!( + "get_or_del_data uid: {:?}, maybe with meta: {:?}", + unique_id, + meta + ); + // let mut data_map = HashMap::new(); // get meta from master let meta = if let Some(meta) = meta { @@ -384,167 +401,192 @@ impl DataGeneral { .await?
}; - tracing::debug!("start get_or_del_data uid: {:?},meta: {:?}", unique_id, meta); - - // basical verify - for idx in 0..meta.data_item_cnt() { - let idx = idx as DataItemIdx; - let check_cache_map = |meta: &DataSetMetaV2| -> WSResult<()> { - if !meta.cache_mode_visitor(idx).is_map_common_kv() - && !meta.cache_mode_visitor(idx).is_map_file() - { - return Err(WsDataError::UnknownCacheMapMode { - mode: meta.cache_mode_visitor(idx).0, - } - .into()); - } - Ok(()) - }; - check_cache_map(&meta)?; - } - - // get data - let p2p = self.view.p2p(); - - match ty { - GetOrDelDataArgType::All => { - for idx in 0..meta.data_item_cnt() { - let idx = idx as DataItemIdx; - let resp = self - .rpc_call_get_data - .call( - p2p, - meta.get_data_node(idx), - proto::GetOneDataRequest { - unique_id: unique_id.to_vec(), - idxs: vec![idx as u32], - delete: false, - return_data: true, - }, - Some(Duration::from_secs(60)), - ) - .await?; - - if !resp.success { - return Err(WsDataError::GetDataFailed { - unique_id: unique_id.to_vec(), - msg: resp.message, - } - .into()); - } - - let _ = data_map.insert(idx, resp.data[0].clone()); - } - } - GetOrDelDataArgType::Delete => { - for idx in 0..meta.data_item_cnt() { - let idx = idx as DataItemIdx; - let resp = self - .rpc_call_get_data - .call( - p2p, - meta.get_data_node(idx), - proto::GetOneDataRequest { - unique_id: unique_id.to_vec(), - idxs: vec![idx as u32], - delete: true, - return_data: true, - }, - Some(Duration::from_secs(60)), - ) - .await?; - - if !resp.success { - return Err(WsDataError::GetDataFailed { - unique_id: unique_id.to_vec(), - msg: resp.message, - } - .into()); - } - - let _ = data_map.insert(idx, resp.data[0].clone()); - } - } - GetOrDelDataArgType::PartialOne { idx } => { - let resp = self - .rpc_call_get_data - .call( - p2p, - meta.get_data_node(idx), - proto::GetOneDataRequest { - unique_id: unique_id.to_vec(), - idxs: vec![idx as u32], - delete: false, - return_data: true, - }, - Some(Duration::from_secs(60)), - ) - .await?; - - if !resp.success { - return Err(WsDataError::GetDataFailed { - unique_id: unique_id.to_vec(), - msg: resp.message, - } - .into()); - } - - let _ = data_map.insert(idx, resp.data[0].clone()); - } - GetOrDelDataArgType::PartialMany { idxs } => { - for idx in idxs { - let resp = self - .rpc_call_get_data - .call( - p2p, - meta.get_data_node(idx), - proto::GetOneDataRequest { - unique_id: unique_id.to_vec(), - idxs: vec![idx as u32], - delete: false, - return_data: true, - }, - Some(Duration::from_secs(60)), - ) - .await?; - - if !resp.success { - return Err(WsDataError::GetDataFailed { - unique_id: unique_id.to_vec(), - msg: resp.message, - } - .into()); - } - - let _ = data_map.insert(idx, resp.data[0].clone()); - } - } - } + tracing::debug!( + "start get_or_del_data uid: {:?},meta: {:?}", + unique_id, + meta + ); - Ok((meta, data_map)) + let idxs: Vec = WantIdxIter::new(&ty, meta.data_item_cnt() as u8).collect(); + let res = self + .batch_get_or_del_data( + unique_id, + &meta, + &idxs, + if ty.is_delete() { + GetOrDelType::DelReturnData + } else { + GetOrDelType::Get + }, + ) + .await + .todo_handle("batch_get_or_del_data err")? 
+ .expect("current we want the result all the time"); + + let resmap = idxs.into_iter().zip(res).collect(); + Ok((meta, resmap)) + // // basical verify + // for idx in 0..meta.data_item_cnt() { + // let idx = idx as DataItemIdx; + // let check_cache_map = |meta: &DataSetMetaV2| -> WSResult<()> { + // if !meta.cache_mode_visitor(idx).is_map_common_kv() + // && !meta.cache_mode_visitor(idx).is_map_file() + // { + // return Err(WsDataError::UnknownCacheMapMode { + // mode: meta.cache_mode_visitor(idx).0, + // } + // .into()); + // } + // Ok(()) + // }; + // check_cache_map(&meta)?; + // } + + // // get data + // let p2p = self.view.p2p(); + + // match ty { + // GetOrDelDataArgType::All => { + // for idx in 0..meta.data_item_cnt() { + // let idx = idx as DataItemIdx; + // let resp = self + // .rpc_call_get_data + // .call( + // p2p, + // meta.get_data_node(idx), + // proto::GetOneDataRequest { + // unique_id: unique_id.to_vec(), + // idxs: vec![idx as u32], + // delete: false, + // return_data: true, + // }, + // Some(Duration::from_secs(60)), + // ) + // .await?; + + // if !resp.success { + // return Err(WsDataError::GetDataFailed { + // unique_id: unique_id.to_vec(), + // msg: resp.message, + // } + // .into()); + // } + + // let _ = data_map.insert(idx, resp.data[0].clone()); + // } + // } + // GetOrDelDataArgType::Delete => { + // for idx in 0..meta.data_item_cnt() { + // let idx = idx as DataItemIdx; + // let resp = self + // .rpc_call_get_data + // .call( + // p2p, + // meta.get_data_node(idx), + // proto::GetOneDataRequest { + // unique_id: unique_id.to_vec(), + // idxs: vec![idx as u32], + // delete: true, + // return_data: true, + // }, + // Some(Duration::from_secs(60)), + // ) + // .await?; + + // if !resp.success { + // return Err(WsDataError::GetDataFailed { + // unique_id: unique_id.to_vec(), + // msg: resp.message, + // } + // .into()); + // } + + // let _ = data_map.insert(idx, resp.data[0].clone()); + // } + // } + // GetOrDelDataArgType::PartialOne { idx } => { + // let resp = self + // .rpc_call_get_data + // .call( + // p2p, + // meta.get_data_node(idx), + // proto::GetOneDataRequest { + // unique_id: unique_id.to_vec(), + // idxs: vec![idx as u32], + // delete: false, + // return_data: true, + // }, + // Some(Duration::from_secs(60)), + // ) + // .await?; + + // if !resp.success { + // return Err(WsDataError::GetDataFailed { + // unique_id: unique_id.to_vec(), + // msg: resp.message, + // } + // .into()); + // } + + // let _ = data_map.insert(idx, resp.data[0].clone()); + // } + // GetOrDelDataArgType::PartialMany { idxs } => { + // for idx in idxs { + // let resp = self + // .rpc_call_get_data + // .call( + // p2p, + // meta.get_data_node(idx), + // proto::GetOneDataRequest { + // unique_id: unique_id.to_vec(), + // idxs: vec![idx as u32], + // delete: false, + // return_data: true, + // }, + // Some(Duration::from_secs(60)), + // ) + // .await?; + + // if !resp.success { + // return Err(WsDataError::GetDataFailed { + // unique_id: unique_id.to_vec(), + // msg: resp.message, + // } + // .into()); + // } + + // let _ = data_map.insert(idx, resp.data[0].clone()); + // } + // } + // } + + // Ok((meta, data_map)) } pub async fn write_data( &self, unique_id: impl Into>, mut datas: Vec, - context_openode_opetype_operole: Option<( + context_openode_opetype_operole_src: Option<( NodeID, proto::DataOpeType, proto::data_schedule_context::OpeRole, + proto::FnTaskId, )>, ) -> WSResult<()> { let unique_id = unique_id.into(); let log_tag = format!("[write_data({})]", 
String::from_utf8_lossy(&unique_id)); tracing::debug!("{} start write data", log_tag); - - let mut data_transfer_sizes=Vec::new(); + + let mut data_transfer_sizes = Vec::new(); data_transfer_sizes.reserve(datas.len()); - for d in datas.iter_mut(){ - data_transfer_sizes.push(d.transfer_size().await.map_err(|err|{ - tracing::error!("{} transfer size error: {}", log_tag, err); - err - })?); + for d in datas.iter_mut() { + data_transfer_sizes.push(d.get_data_size(&self.view.os().file_path).await.map_err( + |err| { + tracing::error!("{} transfer size error: {}", log_tag, err); + err + }, + )?); } // 获取数据调度计划 let version_schedule_resp = self @@ -554,15 +596,25 @@ impl DataGeneral { self.view.p2p().nodes_config.get_master_node(), proto::DataVersionScheduleRequest { unique_id: unique_id.clone(), - context: context_openode_opetype_operole.map(|(node, ope, role)| { - proto::DataScheduleContext { - // each_data_sz_bytes: data_transfer_sizes, 原代码类型不匹配 曾俊 - each_data_sz_bytes: data_transfer_sizes.iter().map(|&x| x as u32).collect(), - ope_node: node as i64, - ope_type: ope as i32, - ope_role: Some(role), - } - }), + context: context_openode_opetype_operole_src.map( + |(node, ope, role, src_task_id)| { + proto::DataScheduleContext { + // each_data_sz_bytes: data_transfer_sizes, 原代码类型不匹配 曾俊 + each_data_sz_bytes: data_transfer_sizes + .iter() + .map(|&x| x as u32) + .collect(), + ope_node: node as i64, + ope_type: ope as i32, + ope_role: Some(role), + src_task_id: Some(src_task_id), + filepaths: datas + .iter() + .map(|d| d.filepath().unwrap_or_default()) + .collect(), + } + }, + ), version: 0, }, Some(Duration::from_secs(60)), @@ -579,24 +631,47 @@ impl DataGeneral { let data_item: &DataItemArgWrapper = &mut datas[data_item_idx as usize]; let split = &splits[data_item_idx as usize]; let mut primary_tasks = Vec::new(); - + // 1. 
并行写入所有主数据分片 - let mut split_iter = WantIdxIter::new(&GetOrDelDataArgType::All, split.splits.len() as u8); + let mut split_iter = + WantIdxIter::new(&GetOrDelDataArgType::All, split.splits.len() as u8); while let Some(split_idx) = split_iter.next() { let split_info = &split.splits[split_idx as usize]; - tracing::debug!("{} creating split write task {}/{} for node {}, offset={}, size={}", - log_tag, split_idx + 1, split.splits.len(), split_info.node_id, split_info.data_offset, split_info.data_size); + tracing::debug!( + "{} creating split write task {}/{} for node {}, offset={}, size={}", + log_tag, + split_idx + 1, + split.splits.len(), + split_info.node_id, + split_info.data_offset, + split_info.data_size + ); let split_info = split_info.clone(); let unique_id_clone = unique_id.clone(); // let data_item_primary = data_item.clone_split_range(split_info.data_offset..split_info.data_offset+split_info.data_size); 类型不匹配 曾俊 // 生成一个复制的可变数据项 let mut data_item_clone = (*data_item).clone(); - let data_item_primary = data_item_clone.clone_split_range(split_info.data_offset as usize..(split_info.data_offset+split_info.data_size)as usize).await.todo_handle("clone_split_range for write data err")?; + let data_item_primary = data_item_clone + .clone_split_range( + &self.view.os().file_path, + split_info.data_offset as usize + ..(split_info.data_offset + split_info.data_size) as usize, + ) + .await + .todo_handle("clone_split_range for write data err")?; + + #[cfg(test)] + { + tracing::debug!( + "data_item_primary partial: {:?}", + &data_item_primary.clone().into_data_bytes()[0..30] + ); + } // let data_item_primary = data_item.clone_split_range(split_info.data_offset as usize..(split_info.data_offset+split_info.data_size)as usize).await.todo_handle("clone_split_range for write data err")?; let view = self.view.clone(); let version_copy = version; let task = tokio::spawn(async move { - view.data_general() + view.data_general() .rpc_call_write_once_data .call( view.p2p(), @@ -618,50 +693,71 @@ impl DataGeneral { } // 2. 
并行写入缓存数据(完整数据) - let visitor = CacheModeVisitor(version_schedule_resp.cache_mode[data_item_idx as usize] as u16); - let need_cache = visitor.is_map_common_kv() || visitor.is_map_file(); - let cache_nodes: Vec = if need_cache { - split.splits.iter().map(|s| s.node_id).collect() - } else { - vec![] - }; - - let mut cache_tasks = Vec::new(); - if !cache_nodes.is_empty() { - tracing::debug!("{} found {} cache nodes: {:?}", log_tag, cache_nodes.len(), cache_nodes); - const MAX_CONCURRENT_TRANSFERS: usize = 3; - let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_TRANSFERS)); - - let mut cache_iter = WantIdxIter::new(&GetOrDelDataArgType::All, cache_nodes.len() as u8); - while let Some(cache_idx) = cache_iter.next() { - let node_id = cache_nodes[cache_idx as usize]; - let permit = semaphore.clone().acquire_owned().await.unwrap(); - tracing::debug!("{} creating cache write task {}/{} for node {}", log_tag, cache_idx + 1, cache_nodes.len(), node_id); - let unique_id_clone = unique_id.clone(); - let data_item_cache = data_item.clone(); - let view = self.view.clone(); - let task = tokio::spawn(async move { - let _permit = permit; // 持有permit直到任务完成 - view.data_general() - // .write_data_batch(unique_id_clone.clone(), version, data_item_cache, data_item_idx, node_id) //类型不匹配 曾俊 - .write_data_batch(unique_id_clone.clone(), version, data_item_cache.dataitem, data_item_idx, node_id) - .await?; - Ok::(proto::WriteOneDataResponse { - remote_version: version, - success: true, - message: String::new(), - }) - }); - cache_tasks.push(task); - } - } + // let visitor = + // CacheModeVisitor(version_schedule_resp.cache_mode[data_item_idx as usize] as u16); + // let need_cache = visitor.is_map_common_kv() || visitor.is_map_file(); + // let cache_nodes: Vec = if need_cache { + // split.splits.iter().map(|s| s.node_id).collect() + // } else { + // vec![] + // }; + + // let mut cache_tasks = Vec::new(); + // if !cache_nodes.is_empty() { + // tracing::debug!( + // "{} found {} cache nodes: {:?}", + // log_tag, + // cache_nodes.len(), + // cache_nodes + // ); + // const MAX_CONCURRENT_TRANSFERS: usize = 3; + // let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_TRANSFERS)); + + // let mut cache_iter = + // WantIdxIter::new(&GetOrDelDataArgType::All, cache_nodes.len() as u8); + // while let Some(cache_idx) = cache_iter.next() { + // let node_id = cache_nodes[cache_idx as usize]; + // let permit = semaphore.clone().acquire_owned().await.unwrap(); + // tracing::debug!( + // "{} creating cache write task {}/{} for node {}", + // log_tag, + // cache_idx + 1, + // cache_nodes.len(), + // node_id + // ); + // let unique_id_clone = unique_id.clone(); + // let data_item_cache = data_item.clone(); + // let view = self.view.clone(); + // let task = tokio::spawn(async move { + // let _permit = permit; // 持有permit直到任务完成 + // view.data_general() + // // .write_data_batch(unique_id_clone.clone(), version, data_item_cache, data_item_idx, node_id) //类型不匹配 曾俊 + // .write_data_batch( + // unique_id_clone.clone(), + // version, + // data_item_cache.dataitem, + // data_item_idx, + // node_id, + // ) + // .await?; + // Ok::(proto::WriteOneDataResponse { + // remote_version: version, + // success: true, + // message: String::new(), + // }) + // }); + // cache_tasks.push(task); + // } + // } let primary_results = futures::future::join_all(primary_tasks).await; tracing::debug!("{} primary_results: {:?}", log_tag, primary_results); - let cache_results = futures::future::join_all(cache_tasks).await; - tracing::debug!("{} cache_results: 
{:?}", log_tag, cache_results); + // let cache_results = futures::future::join_all(cache_tasks).await; + // tracing::debug!("{} cache_results: {:?}", log_tag, cache_results); - if primary_results.iter().any(|res| res.is_err()) || cache_results.iter().any(|res| res.is_err()) { + if primary_results.iter().any(|res| res.is_err()) + // || cache_results.iter().any(|res| res.is_err()) + { let error_msg = format!("主节点或缓存节点数据写入失败"); tracing::error!("{}", error_msg); return Err(WSError::WsDataError(WsDataError::WriteDataFailed { @@ -691,31 +787,32 @@ impl DataGeneral { let fail_by_overwrite = || async { let message = "New data version overwrite".to_owned(); tracing::warn!("{}", message); - - if let Err(e) = responsor //返回结果未处理 曾俊 + + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(WriteOneDataResponse { remote_version: 0, success: false, message, }) - .await{ - tracing::error!("Failed to send write one data response 1: {}", e); - } + .await + { + tracing::error!("Failed to send write one data response 1: {}", e); + } // .todo_handle("1 err_comment waitting to fill"); - }; let fail_with_msg = |message: String| async { tracing::warn!("{}", message); - if let Err(e) = responsor //返回结果未处理 曾俊 + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(WriteOneDataResponse { remote_version: 0, success: false, message, }) - .await { - tracing::error!("Failed to send write one data response 2 : {}", e); - } - // .todo_handle("2 err_comment waitting to fill"); + .await + { + tracing::error!("Failed to send write one data response 2 : {}", e); + } + // .todo_handle("2 err_comment waitting to fill"); }; loop { @@ -816,7 +913,7 @@ impl DataGeneral { || check_meta.as_ref().unwrap().0 != required_meta.as_ref().unwrap().0 { drop(guard); - if let Err(e) = responsor //返回结果未处理 曾俊 + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(WriteOneDataResponse { remote_version: if check_meta.is_none() { 0 @@ -826,9 +923,10 @@ impl DataGeneral { success: false, message: "meta is updated again, cancel write".to_owned(), }) - .await{ - tracing::error!("Failed to send write one data response 3: {}", e); - } + .await + { + tracing::error!("Failed to send write one data response 3: {}", e); + } // .todo_handle("3 err_comment waitting to fill"); return; } @@ -836,14 +934,18 @@ impl DataGeneral { for data_with_idx in req.data.into_iter() { let proto::DataItemWithIdx { idx, data } = data_with_idx; let data = data.unwrap(); - let data_source = data.to_data_item_source(); - let data = Arc::new(data_source); - let serialize = data.as_ref().encode_persist(); + // let data_source = data.to_data_item_source(); + // let data = Arc::new(data_source); + let serialize = data.encode_persist(); tracing::debug!( - "writing data part uid({:?}) idx({}) item({})", + "writing data partial({:?}) uid({:?}) idx({}) type({:?}) at node({}) with mem size {}", + &serialize[0..30], req.unique_id, idx, - data.to_debug_string() + data.get_data_type(), + // data.to_debug_string(), + self.view.p2p().nodes_config.this_node(), + data.inmem_size() ); if let Err(err) = kv_store_engine.set( KeyTypeDataSetItem { @@ -855,19 +957,35 @@ impl DataGeneral { ) { tracing::warn!("flush error: {}", err) } + + #[cfg(test)] + { + // get data directly from kv store + let data = kv_store_engine.get( + &KeyTypeDataSetItem { + uid: req.unique_id.as_ref(), + idx: idx as u8, + }, + false, + KvAdditionalConf {}, + ); + + tracing::debug!("test read data partial({:?})", &data.unwrap().1[0..30]); + } } kv_store_engine.flush(); drop(guard); tracing::debug!("data partial is written"); - if let 
Err(e) = responsor //返回结果未使用 曾俊 + if let Err(e) = responsor //返回结果未使用 曾俊 .send_resp(WriteOneDataResponse { remote_version: req.version, success: true, message: "".to_owned(), }) - .await{ - tracing::error!("Failed to send write one data response 4: {}", e); - } + .await + { + tracing::error!("Failed to send write one data response 4: {}", e); + } // .todo_handle("4 err_comment waitting to fill"); } @@ -890,10 +1008,14 @@ impl DataGeneral { let key = KeyTypeDataSetMeta(&req.unique_id); let keybytes = key.make_key(); - + // test only log #[cfg(test)] - tracing::debug!("rpc_handle_data_meta_update {:?}\n {:?}", req,bincode::deserialize::(&req.serialized_meta)); + tracing::debug!( + "rpc_handle_data_meta_update {:?}\n {:?}", + req, + bincode::deserialize::(&req.serialized_meta) + ); // not test log #[cfg(not(test))] tracing::debug!("rpc_handle_data_meta_update {:?}", req); @@ -902,35 +1024,44 @@ impl DataGeneral { let _kv_write_lock_guard = kv_lock.write(); if let Some((_old_version, mut old_meta)) = - self.view.kv_store_engine().get(&key, true, KvAdditionalConf {}) + self.view + .kv_store_engine() + .get(&key, true, KvAdditionalConf {}) { if old_meta.version > req.version { drop(_kv_write_lock_guard); let err_msg = "New data version is smaller, failed update"; tracing::warn!("{}", err_msg); - if let Err(e) = responsor //返回结果未处理 曾俊 + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(proto::DataMetaUpdateResponse { version: old_meta.version, message: err_msg.to_owned(), }) - .await{ - tracing::error!("Failed to send data meta update response 5: {}", e); - } + .await + { + tracing::error!("Failed to send data meta update response 5: {}", e); + } // .todo_handle("5 err_comment waitting to fill"); return; } old_meta.version = req.version; if req.serialized_meta.len() > 0 { - if let Err(e) = self.view.kv_store_engine() //返回结果未处理 曾俊 - .set_raw(&keybytes, std::mem::take(&mut req.serialized_meta), true){ - tracing::error!("Failed to set raw data in kv store 6: {}", e); - } + if let Err(e) = self + .view + .kv_store_engine() //返回结果未处理 曾俊 + .set_raw(&keybytes, std::mem::take(&mut req.serialized_meta), true) + { + tracing::error!("Failed to set raw data in kv store 6: {}", e); + } // .todo_handle("6 err_comment waitting to fill"); } else { - if let Err(e) = self.view.kv_store_engine() //返回结果未处理 曾俊 - .set(key, &old_meta, true){ - tracing::error!("Failed to set raw data in kv store 7: {}", e); - } + if let Err(e) = self + .view + .kv_store_engine() //返回结果未处理 曾俊 + .set(key, &old_meta, true) + { + tracing::error!("Failed to set raw data in kv store 7: {}", e); + } // .todo_handle("7 err_comment waitting to fill"); } } else { @@ -939,37 +1070,42 @@ impl DataGeneral { "set new meta data, {:?}", bincode::deserialize::(&req.serialized_meta) ); - if let Err(e) = self.view.kv_store_engine() //返回结果未处理 曾俊 - .set_raw(&keybytes, std::mem::take(&mut req.serialized_meta), true){ - tracing::error!("Failed to set raw data in kv store 8: {}", e); - } + if let Err(e) = self + .view + .kv_store_engine() //返回结果未处理 曾俊 + .set_raw(&keybytes, std::mem::take(&mut req.serialized_meta), true) + { + tracing::error!("Failed to set raw data in kv store 8: {}", e); + } // .todo_handle("8 err_comment waitting to fill"); } else { drop(_kv_write_lock_guard); let err_msg = "Old meta data not found and missing new meta"; tracing::warn!("{}", err_msg); - if let Err(e) = responsor //返回结果未处理 曾俊 + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(proto::DataMetaUpdateResponse { version: 0, message: err_msg.to_owned(), }) - .await{ - 
tracing::error!("Failed to send data meta update response 9: {}", e); - } + .await + { + tracing::error!("Failed to send data meta update response 9: {}", e); + } // .todo_handle("9 err_comment waitting to fill"); return; } } drop(_kv_write_lock_guard); tracing::debug!("rpc_handle_data_meta_update success"); - if let Err(e) = responsor //返回结果未处理 曾俊 + if let Err(e) = responsor //返回结果未处理 曾俊 .send_resp(proto::DataMetaUpdateResponse { version: req.version, message: "Update success".to_owned(), }) - .await{ - tracing::error!("Failed to send data meta update response 10: {}", e); - } + .await + { + tracing::error!("Failed to send data meta update response 10: {}", e); + } // .todo_handle("10 err_comment waitting to fill"); } @@ -1004,7 +1140,8 @@ impl DataGeneral { tracing::debug!("starting rpc_handle_get_one_data {:?}", req); let kv_store_engine = self.view.kv_store_engine(); - let _ = self.view + let _ = self + .view .get_metadata(&req.unique_id, req.delete) .await .map_err(|err| { @@ -1017,6 +1154,7 @@ impl DataGeneral { for idx in req.idxs { let value = if req.delete { + tracing::debug!("deleting data item at idx: {}", idx); match kv_store_engine.del( KeyTypeDataSetItem { uid: req.unique_id.as_ref(), @@ -1031,6 +1169,7 @@ impl DataGeneral { } } } else { + tracing::debug!("getting data item at idx: {}", idx); kv_store_engine.get( &KeyTypeDataSetItem { uid: req.unique_id.as_ref(), @@ -1051,20 +1190,58 @@ impl DataGeneral { } msg }) - } else if got_or_deleted.iter().all(|v| v.is_some()) { - (true, "success".to_owned()) } else { - tracing::warn!("some data not found"); - (false, "some data not found".to_owned()) + let notfound_idxs = got_or_deleted + .iter() + .enumerate() + .filter(|(_, v)| v.is_none()) + .map(|(idx, _)| idx as u8) + .collect::>(); + if notfound_idxs.len() > 0 { + let msg = format!( + "some data not found on node({}), idxs: {:?},", + self.view.p2p().nodes_config.this_node(), + notfound_idxs + ); + tracing::warn!("{}", msg); + (false, msg) + } else { + (true, "success".to_owned()) + } }; + // if got_or_deleted.iter().all(|v| v.is_some()) { + // (true, "success".to_owned()) + // } else { + // tracing::warn!("some data not found"); + // (false, "some data not found".to_owned()) + // }; let mut got_or_deleted_checked: Vec = vec![]; if success { for v in got_or_deleted { + #[cfg(test)] + { + tracing::debug!( + "test get one data partial({:?})", + &v.clone().unwrap().1[0..30] + ); + } let decode_res = proto::DataItem::decode_persist(v.unwrap().1); match decode_res { Ok(item) => { - tracing::debug!("decoded data item: {:?}", item.to_string()); + tracing::debug!( + "decoded data item: {:?} with mem len {}", + item.to_string(), + item.inmem_size() + ); + + #[cfg(test)] + { + tracing::debug!( + "get one data decoded data item partial: {:?}", + &item.clone().into_data_bytes()[0..30] + ); + } got_or_deleted_checked.push(item); } Err(e) => { @@ -1088,114 +1265,6 @@ impl DataGeneral { Ok(()) } - // 处理批量数据写入请求 - pub async fn rpc_handle_batch_data( - &self, - responsor: RPCResponsor, - req: proto::BatchDataRequest, - ) -> WSResult<()> { - tracing::debug!("rpc_handle_batch_data with batchid({:?})", req.request_id.clone().unwrap()); - let batch_receive_states = self.batch_receive_states.clone(); - // 预先克隆闭包外需要的字段 - let block_index = req.block_index; - let data = req.data.clone(); - let request_id = req.request_id.clone().unwrap(); - - // 1. 
查找或创建状态 - let state = match self.batch_receive_states - .get_or_init(req.request_id.clone().unwrap(), async move { - // 创建任务组和句柄 - let (mut group, handle) = match WriteSplitDataTaskGroup::new( - req.unique_id.clone(), - req.total_size as usize, - // req.block_type(), 类型错误 曾俊 - req.block_type.unwrap().data_item_dispatch.unwrap(), - req.version, - ).await { - Ok((group, handle)) => (group, handle), - Err(e) => { - tracing::error!("Failed to create task group: {:?}", e); - return Err(e); - } - }; - - // 再process之前订阅,避免通知先于订阅 - let waiter = handle.get_all_tasks_waiter(); - - // 启动process_tasks - let _ = tokio::spawn(async move { - match group.process_tasks().await { - Ok(item) => Ok(item), - Err(e) => { - tracing::error!("Failed to process tasks: {}", e); - Err(e) - } - } - }); - - let state = Arc::new(BatchReceiveState::new(handle, SharedWithBatchHandler::new())); - let state_clone = state.clone(); - - // response task - let _=tokio::spawn(async move { - tracing::debug!("rpc_handle_batch_data response task started"); - // 等待所有任务完成 - if let Err(e) = waiter.wait().await { - tracing::error!("Failed to wait for tasks: {}", e); - todo!("use responsor to send error response"); - return; - } - - tracing::debug!("rpc_handle_batch_data response task wait all tasks done"); - - // 发送最终响应 - if let Some(final_responsor) = state_clone.shared.get_final_responsor().await { - if let Err(e) = final_responsor.send_resp(proto::BatchDataResponse { - request_id: Some(req.request_id.clone().unwrap()), - success: true, - error_message: String::new(), - version: state_clone.handle.version(), - }).await { - tracing::error!("Failed to send final response: {}", e); - } - } - - // 清理状态 - let _=batch_receive_states.remove(&req.request_id.unwrap()); - }); - - Ok(state) - }) - .await { - Err(e) => return Err(WSError::WsDataError(WsDataError::BatchTransferError { - request_id, - msg: format!("Failed to initialize batch state: {}", e) - })), - Ok(state) => state, - }; - - tracing::debug!("rpc_handle_batch_data ready with write_split_data_task_group"); - - // 2. 提交分片数据 - let data_item = proto::DataItem { - data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(data)), - ..Default::default() - }; - - tracing::debug!("submit_split with data split idx: {}, at node: {}", block_index, self.view.p2p().nodes_config.this_node()); - state.handle.submit_split( - block_index as usize * DEFAULT_BLOCK_SIZE, - data_item, - ).await?; - - // 3. 
update the responsor - state.shared.update_responsor(responsor).await; - - Ok(()) - } - - - //费新文 // pub async fn distribute_task_to_worker( // &self, @@ -1250,9 +1319,9 @@ impl DataGeneral { // req: DistributeTaskReq, // ) { // tracing::debug!("rpc_handle_distribute_task with req({:?})", req); - + // // TODO: the concrete task-handling logic still needs to be implemented here - + // if let Err(e) = responsor // .send_resp(DistributeTaskResp { // success: true, @@ -1262,10 +1331,6 @@ impl DataGeneral { // tracing::error!("Failed to send distribute task response: {}", e); // } // } - - - - } #[derive(Serialize, Deserialize, Debug, Clone)] @@ -1304,7 +1369,7 @@ pub struct DataSetMetaV1 { /// Note: to create new metadata, use `DataSetMetaBuilder` /// /// https://fvd360f8oos.feishu.cn/docx/XoFudWhAgox84MxKC3ccP1TcnUh#share-Tqqkdxubpokwi5xREincb1sFnLc -#[derive(Serialize, Deserialize, Debug,Clone)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct DataSetMetaV2 { // unique_id: Vec<u8>, api_version: u8, @@ -1313,13 +1378,14 @@ pub struct DataSetMetaV2 { pub data_metas: Vec<DataMeta>, pub synced_nodes: HashSet<NodeID>, pub cache_mode: Vec<CacheMode>, + pub filepath: Vec<Option<String>>, } impl DataSetMetaV2 { pub fn cache_mode_visitor(&self, idx: DataItemIdx) -> CacheModeVisitor { CacheModeVisitor(self.cache_mode[idx as usize]) } - + pub fn data_item_cnt(&self) -> usize { self.datas_splits.len() } @@ -1343,7 +1409,7 @@ pub struct EachNodeSplit { pub node_id: NodeID, pub data_offset: u32, pub data_size: u32, - pub cache_mode: u32, // added the cache_mode field + pub cache_mode: u32, // added the cache_mode field } impl EachNodeSplit { @@ -1359,6 +1425,12 @@ pub struct DataSplit { pub splits: Vec<EachNodeSplit>, } +impl DataSplit { + pub fn total_size(&self) -> usize { + self.splits.iter().map(|s| s.data_size as usize).sum() + } +} + pub type DataSplitIdx = usize; // impl DataSplit {
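The new total_size helper simply sums per-node shard sizes; mirrored on standalone types (Shard is a stand-in for EachNodeSplit), together with a contiguity check that the patch does not include but that the offset/size layout invites:

struct Shard { data_offset: u32, data_size: u32 }

// Total bytes covered by one DataSplit-like collection of shards.
fn total_size(shards: &[Shard]) -> usize {
    shards.iter().map(|s| s.data_size as usize).sum()
}

// Shards tile the range exactly when each starts where the previous ended
// (offsets assumed sorted; illustration only).
fn is_contiguous(shards: &[Shard]) -> bool {
    shards.windows(2).all(|w| w[0].data_offset + w[0].data_size == w[1].data_offset)
}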
@@ -1456,7 +1528,7 @@ macro_rules! generate_cache_mode_methods { impl DataSetMetaBuilder { $( pub fn [](&mut self, idx: DataItemIdx) -> &mut Self { - self.assert_cache_mode_len(); + self.assert_len(); self.building.as_mut().unwrap().cache_mode[idx as usize] = (self.building.as_mut().unwrap().cache_mode[idx as usize] & ![]) | ([] & []); @@ -1494,7 +1566,7 @@ fn test_cache_mode_visitor() { // test builder - let meta = DataSetMetaBuilder::new() + let meta = DataSetMetaBuilder::new(vec![None]) .set_data_splits(vec![DataSplit { splits: vec![] }]) .cache_mode_map_file(0) .cache_mode_time_forever(0) @@ -1503,7 +1575,7 @@ fn test_cache_mode_visitor() { assert!(!meta.cache_mode_visitor(0).is_map_common_kv()); assert!(meta.cache_mode_visitor(0).is_time_forever()); assert!(!meta.cache_mode_visitor(0).is_time_auto()); - let meta = DataSetMetaBuilder::new() + let meta = DataSetMetaBuilder::new(vec![None]) .set_data_splits(vec![DataSplit { splits: vec![] }]) .cache_mode_map_common_kv(0) .cache_mode_time_forever(0) @@ -1523,7 +1595,7 @@ impl From<DataSetMetaV2> for DataSetMetaBuilder { } } impl DataSetMetaBuilder { - pub fn new() -> Self { + pub fn new(filepath: Vec<Option<String>>) -> Self { Self { building: Some(DataSetMetaV2 { version: 0, @@ -1532,13 +1604,24 @@ impl DataSetMetaBuilder { api_version: 2, synced_nodes: HashSet::new(), cache_mode: vec![], + filepath: filepath, }), } } - fn assert_cache_mode_len(&self) { + // pub fn filepath(&mut self, ) -> &mut Self { + // self.building.as_mut().unwrap().filepath = filepath; + // self + // } + fn assert_len(&self) { if self.building.as_ref().unwrap().cache_mode.len() == 0 { panic!("please set_data_splits before set_cache_mode"); } + // filepath and cache_mode must have the same length + if self.building.as_ref().unwrap().cache_mode.len() + != self.building.as_ref().unwrap().filepath.len() + { + panic!("cache_mode and filepath length must be equal"); + } } pub fn version(&mut self, version: u64) -> &mut Self { @@ -1587,6 +1670,11 @@ pub enum GetOrDelDataArgType { PartialOne { idx: DataItemIdx }, PartialMany { idxs: BTreeSet<DataItemIdx> }, } +impl GetOrDelDataArgType { + pub fn is_delete(&self) -> bool { + matches!(self, Self::Delete) + } +}
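Every generated cache_mode_* method performs the same masked update on the u16 mode word: clear the field selected by the mask, then set the wanted bits. The core operation written out as a sketch (field semantics follow the CACHE_MODE_* constants defined earlier in this file):

// Replace only the bits selected by `mask`, leaving the other fields intact.
fn set_mode_field(mode: u16, mask: u16, bits: u16) -> u16 {
    (mode & !mask) | (bits & mask)
}
// e.g. rewriting just the nibble guarded by CACHE_MODE_TIME_MASK (0xf000)
// leaves the map/pos fields of `mode` untouched.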
impl DataGeneralView { fn get_data_meta_local( @@ -1612,18 +1700,16 @@ impl DataGeneralView { Ok(meta_opt) } - pub async fn get_metadata( - &self, - unique_id: &[u8], - delete: bool, - ) -> WSResult<DataSetMetaV2> { + pub async fn get_metadata(&self, unique_id: &[u8], delete: bool) -> WSResult<DataSetMetaV2> { // try the local store first if let Some((_version, meta)) = self.get_data_meta_local(unique_id, delete)? { return Ok(meta); } // not found locally, fetch from the master - self.data_general().get_or_del_datameta_from_master(unique_id, delete).await + self.data_general() + .get_or_del_datameta_from_master(unique_id, delete) + .await } } @@ -1633,6 +1719,8 @@ impl From for WSError { } } +const DATA_TMP_DIR: &str = "data_tmp"; + #[async_trait] impl LogicalModule for DataGeneral { fn inner_new(args: LogicalModuleNewArgs) -> Self where @@ -1650,22 +1738,27 @@ impl LogicalModule for DataGeneral { // //费新文 // rpc_call_distribute_task: RPCCaller::new(), // rpc_handler_distribute_task: RPCHandler::new(), - - rpc_handler_write_once_data: RPCHandler::new(), rpc_handler_batch_data: RPCHandler::new(), rpc_handler_data_meta_update: RPCHandler::new(), rpc_handler_get_data_meta: RPCHandler::new(), rpc_handler_get_data: RPCHandler::new(), - + // batch data receive state management batch_receive_states: AsyncInitMap::new(), + cache_in_memory: moka::sync::Cache::builder() + .max_capacity(1024 * 1024 * 4) + .time_to_live(Duration::from_secs(5 * 60)) + .weigher(|_, value: &Vec<u8>| value.len() as u32) + .build(), } }
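cache_in_memory above is a byte-weighted moka cache: max_capacity is measured in the weigher's units, so with len() as the weight the cache holds at most 4 MiB of values, and every entry expires five minutes after insertion. A freestanding equivalent, assuming the moka crate's sync API and the String key type used here:

use std::time::Duration;

// Capacity counts bytes (the weigher's unit), not entries.
fn build_cache() -> moka::sync::Cache<String, Vec<u8>> {
    moka::sync::Cache::builder()
        .max_capacity(4 * 1024 * 1024)
        .time_to_live(Duration::from_secs(5 * 60))
        .weigher(|_key: &String, value: &Vec<u8>| value.len() as u32)
        .build()
}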
async fn start(&self) -> WSResult<Vec<JoinHandleWrapper>> { tracing::info!("start as master"); + fs::create_dir_all(self.view.os().file_path.join(DATA_TMP_DIR)).await?; + let p2p = self.view.p2p(); // register rpc callers { @@ -1675,14 +1768,12 @@ impl LogicalModule for DataGeneral { self.rpc_call_get_data_meta.regist(p2p); self.rpc_call_get_data.regist(p2p); - //费新文 // self.rpc_call_distribute_task.regist(p2p); } // register rpc handlers { - //费新文 // let view = self.view.clone(); // self.rpc_handler_distribute_task.regist( @@ -1696,14 +1787,15 @@ impl LogicalModule for DataGeneral { // Ok(()) // }, // ); - let view = self.view.clone(); self.rpc_handler_write_once_data .regist(p2p, move |responsor, req| { let view = view.clone(); let _ = tokio::spawn(async move { - view.data_general().rpc_handle_write_one_data(responsor, req).await; + view.data_general() + .rpc_handle_write_one_data(responsor, req) + .await; }); Ok(()) }); @@ -1715,7 +1807,10 @@ impl LogicalModule for DataGeneral { req: proto::BatchDataRequest| { let view = view.clone(); let _ = tokio::spawn(async move { - let _ = view.data_general().rpc_handle_batch_data(responsor, req).await; + let _ = view + .data_general() + .rpc_handle_batch_data(responsor, req) + .await; }); Ok(()) }, ); @@ -1728,7 +1823,9 @@ impl LogicalModule for DataGeneral { req: proto::DataMetaUpdateRequest| { let view = view.clone(); let _ = tokio::spawn(async move { - view.data_general().rpc_handle_data_meta_update(responsor, req).await + view.data_general() + .rpc_handle_data_meta_update(responsor, req) + .await }); Ok(()) }, ); @@ -1739,10 +1836,13 @@ impl LogicalModule for DataGeneral { .regist(p2p, move |responsor, req| { let view = view.clone(); let _ = tokio::spawn(async move { - if let Err(e) = view.data_general().rpc_handle_get_data_meta(req, responsor) // return value not handled (曾俊) - .await{ - tracing::error!("Failed to handle get data meta: {}", e); - } + if let Err(e) = view + .data_general() + .rpc_handle_get_data_meta(req, responsor) // return value not handled (曾俊) + .await + { + tracing::error!("Failed to handle get data meta: {}", e); + } // .todo_handle("rpc_handle_get_data_meta err"); }); Ok(()) @@ -1754,8 +1854,10 @@ impl LogicalModule for DataGeneral { move |responsor: RPCResponsor<proto::GetOneDataRequest>, req: proto::GetOneDataRequest| { let view = view.clone(); - let _ = tokio::spawn(async move { - view.data_general().rpc_handle_get_one_data(responsor, req).await + let _ = tokio::spawn(async move { + view.data_general() + .rpc_handle_get_one_data(responsor, req) + .await }); Ok(()) }, @@ -1767,4 +1869,4 @@ impl LogicalModule for DataGeneral { } #[derive(Debug, Clone, Copy)] -pub struct CacheModeVisitor(pub u16); \ No newline at end of file +pub struct CacheModeVisitor(pub u16); diff --git a/src/main/src/general/data/m_kv_store_engine.rs b/src/main/src/general/data/m_kv_store_engine.rs index b488f0a..643b8a0 100644 --- a/src/main/src/general/data/m_kv_store_engine.rs +++ b/src/main/src/general/data/m_kv_store_engine.rs @@ -569,10 +569,11 @@ mod test { let (_hold, _sys1, sys2) = test_utils::get_test_sys().await; let view = View::new(sys2); let key = "test_kv_store_engine_key"; - view.kv_store_engine() + let _ = view + .kv_store_engine() .set( KeyTypeDataSetMeta(key.as_bytes()), - &DataSetMetaBuilder::new() + &DataSetMetaBuilder::new(vec![None]) .cache_mode_map_common_kv(0) .cache_mode_pos_allnode(0) .cache_mode_time_auto(0) diff --git a/src/main/src/general/network/m_p2p.rs b/src/main/src/general/network/m_p2p.rs index 88675b8..ae6bbbe 100644 --- a/src/main/src/general/network/m_p2p.rs +++ b/src/main/src/general/network/m_p2p.rs @@ -415,18 +415,19 @@ impl P2PModule { let _ = self .waiting_tasks .insert((taskid, node_id), Some(tx).into()); - if let Err(e) = self.dispatch( // return value not handled (曾俊) + if let Err(e) = self.dispatch( + // return value not handled (曾俊) node_id, r.msg_id(), taskid, DispatchPayload::Local(Box::new(r)), - ){ + ) { tracing::error!("Failed to dispatch rpc: {}", e); } //.todo_handle(); // changed by 虞光勇; reason: the todo_handle call was missing a parameter, so the required string argument must be supplied. // change: added the string argument. - // .todo_handle("This part of the code needs to be implemented."); + // .todo_handle("This part of the code needs to be implemented."); let resp = rx.await.unwrap(); let resp = resp.downcast::<R::Resp>().unwrap(); diff --git a/src/main/src/general/network/mod.rs b/src/main/src/general/network/mod.rs index 8f1ceff..7e5bfcf 100644 --- a/src/main/src/general/network/mod.rs +++ b/src/main/src/general/network/mod.rs @@ -6,6 +6,7 @@ pub mod proto_ext; pub mod rpc_model; pub mod proto { + pub mod kv { include!(concat!(env!("OUT_DIR"), "/kv.rs")); } @@ -13,9 +14,9 @@ pub mod proto { include!(concat!(env!("OUT_DIR"), "/raft.rs")); } - pub mod sche { - include!(concat!(env!("OUT_DIR"), "/sche.rs")); - } + // pub mod sche { + include!(concat!(env!("OUT_DIR"), "/proto.rs")); + // } pub mod metric { include!(concat!(env!("OUT_DIR"), "/metric.rs")); @@ -23,5 +24,5 @@ pub mod proto { pub mod remote_sys { include!(concat!(env!("OUT_DIR"), "/remote_sys.rs")); } - include!(concat!(env!("OUT_DIR"), "/data.rs")); + // include!(concat!(env!("OUT_DIR"), "/data.rs")); } diff --git a/src/main/src/general/network/msg_pack.rs b/src/main/src/general/network/msg_pack.rs index 90c4f82..3a2a433 100644 --- a/src/main/src/general/network/msg_pack.rs +++ b/src/main/src/general/network/msg_pack.rs @@ -61,8 +61,8 @@ define_msg_ids!( (proto::raft::VoteResponse, _pack, { true }), (proto::raft::AppendEntriesRequest, _pack, { true }), (proto::raft::AppendEntriesResponse, _pack, { true }), - (proto::sche::DistributeTaskReq, _pack, { true }), - (proto::sche::DistributeTaskResp, _pack, { true }), + (proto::DistributeTaskReq, _pack, { true }), + (proto::DistributeTaskResp, _pack, { true }), (proto::metric::RscMetric, _pack, { true }), (proto::kv::KvRequests, pack, { for r in &pack.requests { @@ -139,12 +147,20 @@ define_msg_ids!( // 2. unique_id must be present; it identifies the dataset // 3.
data must be present; it is the actual data payload let req = _pack; - match (req.request_id.is_some(), req.unique_id.is_empty(), req.data.is_empty()) { + match ( + req.request_id.is_some(), + req.unique_id.is_empty(), + req.data.is_empty(), + ) { (true, false, false) => true, - _ => false + _ => false, } }), - (proto::BatchDataResponse, _pack, { true }) + (proto::BatchDataResponse, _pack, { true }), + (proto::AddWaitTargetReq, _pack, { true }), + (proto::AddWaitTargetResp, _pack, { true }), + (proto::ListenForTaskDoneReq, _pack, { true }), + (proto::ListenForTaskDoneResp, _pack, { true }) ); pub trait RPCReq: MsgPack + Default { @@ -159,8 +167,8 @@ impl RPCReq for proto::raft::AppendEntriesRequest { type Resp = proto::raft::AppendEntriesResponse; } -impl RPCReq for proto::sche::DistributeTaskReq { - type Resp = proto::sche::DistributeTaskResp; +impl RPCReq for proto::DistributeTaskReq { + type Resp = proto::DistributeTaskResp; } impl RPCReq for proto::kv::KvRequests { @@ -203,6 +211,14 @@ impl RPCReq for proto::BatchDataRequest { type Resp = proto::BatchDataResponse; } +impl RPCReq for proto::AddWaitTargetReq { + type Resp = proto::AddWaitTargetResp; +} + +impl RPCReq for proto::ListenForTaskDoneReq { + type Resp = proto::ListenForTaskDoneResp; +} + // impl RPCReq for proto::kv::KvLockWaitAcquireNotifyRequest { // type Resp = proto::kv::KvLockWaitAcquireNotifyResponse; // } diff --git a/src/main/src/general/network/proto_ext.rs b/src/main/src/general/network/proto_ext.rs index ad1aafb..4609509 100644 --- a/src/main/src/general/network/proto_ext.rs +++ b/src/main/src/general/network/proto_ext.rs @@ -2,23 +2,33 @@ use crate::general::app::DataEventTrigger; use crate::general::data::m_data_general::dataitem::DataItemSource; use crate::general::data::m_data_general::DataItemIdx; use crate::general::data::m_dist_lock::DistLockOpe; -use crate::general::network::proto::sche::distribute_task_req::{ +use crate::general::network::proto::distribute_task_req::{ DataEventTriggerNew, DataEventTriggerWrite, Trigger, }; use super::proto::{self, kv::KvResponse, FileData}; -use std::{ops::Range, path::Path}; -use crate::result::{WSResult, WSError, WsDataError}; -use std::path::PathBuf; +use crate::result::{WSError, WSResult, WsDataError}; use std::fs::File; +use std::path::PathBuf; +use std::{ops::Range, path::Path}; use tokio; -use tokio::io::{AsyncSeekExt, AsyncReadExt}; +use tokio::io::{AsyncReadExt, AsyncSeekExt}; pub enum NewPartialFileDataArg { - FilePath{path: PathBuf, zip_path:Option<PathBuf>}, - FileContent{path:PathBuf,content:Vec<u8>}, - File{path:PathBuf,file:File}, + FilePath { + basepath: PathBuf, + path: PathBuf, + zip_path: Option<PathBuf>, + }, + FileContent { + path: PathBuf, + content: Vec<u8>, + }, + File { + path: PathBuf, + file: File, + }, }
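Each new message type is wired to its response through RPCReq's associated type, so a generic caller can name the reply type without a second type parameter. The shape of the pattern in isolation (a sketch, not this module's actual trait bounds):

// Pair a request with its response at the type level.
trait Rpc {
    type Resp;
}

struct PingReq;
struct PingResp(bool);

impl Rpc for PingReq {
    type Resp = PingResp;
}

// Callers stay generic and still return the matching response type.
fn respond<R: Rpc>(_req: R, resp: R::Resp) -> R::Resp {
    resp
}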
pub trait ProtoExtDataItem: Sized { @@ -28,8 +38,13 @@ pub trait ProtoExtDataItem: Sized { fn as_raw_bytes<'a>(&'a self) -> Option<&'a [u8]>; fn as_file_data(&self) -> Option<&proto::FileData>; fn to_data_item_source(&self) -> DataItemSource; - async fn new_partial_file_data(arg: NewPartialFileDataArg, range: Range<usize>) -> WSResult<Self>; + async fn new_partial_file_data( + arg: NewPartialFileDataArg, + range: Range<usize>, + ) -> WSResult<Self>; fn new_partial_raw_bytes(rawbytes: impl Into<Vec<u8>>, range: Range<usize>) -> WSResult<Self>; + fn new_file_data(path: &str, is_dir: bool) -> Self; + fn new_mem_data(mem: Vec<u8>) -> Self; } impl ProtoExtDataItem for proto::DataItem { @@ -41,71 +56,110 @@ impl ProtoExtDataItem for proto::DataItem { actual: bytes.len(), })); } - + Ok(Self { data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes( - bytes[range].to_vec() + bytes[range].to_vec(), )), }) } - async fn new_partial_file_data(arg: NewPartialFileDataArg, range: Range<usize>) -> WSResult<Self> { + fn new_file_data(path: &str, is_dir: bool) -> Self { + Self { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData { + file_name_opt: path.to_string(), + is_dir_opt: is_dir, + file_content: vec![], + })), + } + } + + fn new_mem_data(mem: Vec<u8>) -> Self { + Self { + data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(mem)), + } + } + + async fn new_partial_file_data( + arg: NewPartialFileDataArg, + range: Range<usize>, + ) -> WSResult<Self> { let mut file_data = proto::FileData::default(); - + // read the specified byte range from the file - async fn read_file_range(path: &Path, file: tokio::fs::File, range: Range<usize>) -> WSResult<Vec<u8>> { + async fn read_file_range( + path: &Path, + file: tokio::fs::File, + range: Range<usize>, + ) -> WSResult<Vec<u8>> { let mut file = tokio::io::BufReader::new(file); // file.seek(std::io::SeekFrom::Start(range.start as u64)) 曾俊: the success value was not handled - let _ = file.seek(std::io::SeekFrom::Start(range.start as u64)) + let _ = file + .seek(std::io::SeekFrom::Start(range.start as u64)) .await - .map_err(|e| WSError::WsDataError(WsDataError::FileSeekErr { - path: path.to_path_buf(), - err: e, - }))?; - + .map_err(|e| { + WSError::WsDataError(WsDataError::FileSeekErr { + path: path.to_path_buf(), + err: e, + }) + })?; + let mut buffer = vec![0; range.end - range.start]; // file.read_exact(&mut buffer) - let _ = file.read_exact(&mut buffer) - .await - .map_err(|e| WSError::WsDataError(WsDataError::FileReadErr { + let n = file.read_exact(&mut buffer).await.map_err(|e| { + WSError::WsDataError(WsDataError::FileReadErr { path: path.to_path_buf(), err: e, - }))?; - + }) + })?; + + assert_eq!(n, buffer.len(), "read file range len mismatch"); + + tracing::debug!( + "read file range len ({}), partial ({:?})", + buffer.len(), + &buffer[0..30] + ); + Ok(buffer) }
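Both the raw-bytes path earlier and the FileContent arm below guard the requested range against the available length before slicing; the guard in isolation (a sketch with a simplified error type):

use std::ops::Range;

// Validated sub-slice: reject out-of-range requests instead of panicking.
fn slice_checked(data: &[u8], range: Range<usize>) -> Result<&[u8], String> {
    if range.start > range.end || range.end > data.len() {
        return Err(format!("size mismatch: want {:?}, have {}", range, data.len()));
    }
    Ok(&data[range])
}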
match arg { - NewPartialFileDataArg::FilePath { path, zip_path } => { - file_data.file_name_opt = path.to_string_lossy().to_string(); + NewPartialFileDataArg::FilePath { + basepath, + path: path_, + zip_path, + } => { + let path = basepath.join(&path_); + file_data.file_name_opt = path_.to_string_lossy().to_string(); file_data.is_dir_opt = path.is_dir(); - + // if it is a directory, use the zip file let actual_path = if path.is_dir() { - zip_path.as_ref().ok_or_else(|| WSError::WsDataError( - WsDataError::BatchTransferFailed { + zip_path.as_ref().ok_or_else(|| { + WSError::WsDataError(WsDataError::BatchTransferFailed { // node: 0, // batch: 0, request_id: proto::BatchRequestId { node_id: 0, sequence: 0, }, reason: "Directory must have zip_path".to_string(), - } - ))? + }) + })? } else { &path }; - - let file = tokio::fs::File::open(actual_path) - .await - .map_err(|e| WSError::WsDataError(WsDataError::FileOpenErr { + + let file = tokio::fs::File::open(actual_path).await.map_err(|e| { + WSError::WsDataError(WsDataError::FileOpenErr { path: actual_path.to_path_buf(), err: e, - }))?; - + }) + })?; + file_data.file_content = read_file_range(actual_path, file, range).await?; - }, + } NewPartialFileDataArg::FileContent { path, content } => { if range.end > content.len() { return Err(WSError::WsDataError(WsDataError::SizeMismatch { @@ -116,16 +170,16 @@ impl ProtoExtDataItem for proto::DataItem { file_data.file_name_opt = path.to_string_lossy().to_string(); file_data.is_dir_opt = path.is_dir(); file_data.file_content = content[range].to_vec(); - }, + } NewPartialFileDataArg::File { path, file } => { file_data.file_name_opt = path.to_string_lossy().to_string(); file_data.is_dir_opt = path.is_dir(); - + let file = tokio::fs::File::from_std(file); file_data.file_content = read_file_range(&path, file, range).await?; } } - + Ok(Self { data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(file_data)), }) @@ -187,9 +241,7 @@ impl ProtoExtDataItem for proto::DataItem { Some(proto::data_item::DataItemDispatch::File(file_data)) => DataItemSource::File { path: file_data.file_name_opt.clone().into(), }, - _ => DataItemSource::Memory { - data: Vec::new(), - }, + _ => DataItemSource::Memory { data: Vec::new() }, } } } @@ -296,34 +348,65 @@ impl KvRequestExt for proto::kv::KvRequest { } } +// pub trait DataItemExt { +// fn decode_persist(data: Vec<u8>) -> WSResult<Self> +// where +// Self: Sized; +// fn encode_persist<'a>(&'a self) -> Vec<u8>; +// fn get_data_type(&self) -> proto::data_item::DataItemDispatch; +// fn into_data_bytes(self) -> Vec<u8>; +// } + pub trait DataItemExt { - fn decode_persist(data: Vec<u8>) -> WSResult<Self> where Self: Sized; + fn decode_persist(data: Vec<u8>) -> WSResult<Self> + where + Self: Sized; fn encode_persist<'a>(&'a self) -> Vec<u8>; + fn get_data_type(&self) -> proto::data_item::DataItemDispatch; + fn into_data_bytes(self) -> Vec<u8>; + fn inmem_size(&self) -> usize; }
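encode_persist below stores its metadata as a trailer rather than a header: the content comes first, then the file name bytes, the name length, an is_dir flag, and finally the type tag, so decode_persist can peel fields off the end without knowing the content length up front. A round-trip sketch of that layout on plain types (File case only; the name length is capped at one byte, as in the patch):

// Trailer layout: [content][name bytes][name_len: u8][is_dir: u8][tag: u8]
fn encode_file(content: &[u8], name: &str, is_dir: bool) -> Vec<u8> {
    assert!(name.len() <= u8::MAX as usize, "name must fit one length byte");
    let mut out = content.to_vec();
    out.extend_from_slice(name.as_bytes());
    out.push(name.len() as u8);
    out.push(is_dir as u8);
    out.push(0); // tag 0 marks a File item
    out
}

fn decode_file(mut data: Vec<u8>) -> (Vec<u8>, String, bool) {
    assert_eq!(data.pop(), Some(0), "expected the File tag"); // peel from the end
    let is_dir = data.pop() == Some(1);
    let name_len = data.pop().unwrap() as usize;
    let name = String::from_utf8(data.split_off(data.len() - name_len)).unwrap();
    (data, name, is_dir) // the remaining bytes are the content
}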
file_name_opt: path_str, + // is_dir_opt: false, + // file_content: Vec::new(), + // }) + data.truncate(data.len() - 3 - filename_len); proto::data_item::DataItemDispatch::File(FileData { - file_name_opt: path_str, - is_dir_opt: false, - file_content: Vec::new(), + file_name_opt: filename, + is_dir_opt: is_dir, // ignore is_dir + file_content: data, // [0..data.len() - 3 - filename_len].to_vec(), }) - }, - 1 => proto::data_item::DataItemDispatch::RawBytes(data[1..].to_vec()), + } + 1 => proto::data_item::DataItemDispatch::RawBytes(data[0..data.len() - 1].to_vec()), _ => { return Err(WSError::WsDataError(WsDataError::DataDecodeError { reason: format!("Unknown data item type id: {}", data[0]), @@ -338,17 +421,58 @@ impl DataItemExt for proto::DataItem { fn encode_persist<'a>(&'a self) -> Vec { match self.data_item_dispatch.as_ref().unwrap() { proto::data_item::DataItemDispatch::File(f) => { - let mut ret = vec![0]; + let mut ret = vec![]; + tracing::debug!("encoding file data, size: {}", f.file_content.len()); ret.extend_from_slice(&f.file_content); + // append file name on the end of the file content + ret.extend_from_slice(f.file_name_opt.as_bytes()); + // append file name length on the end of the file content + ret.push(f.file_name_opt.len() as u8); + // append is dir on the end of the file content + ret.push(if f.is_dir_opt { 1 } else { 0 }); + // append data item type id on the end of the file content + ret.push(0); ret } proto::data_item::DataItemDispatch::RawBytes(bytes) => { - let mut ret = vec![1]; + let mut ret = vec![]; + tracing::debug!("encoding raw bytes data, size: {}", bytes.len()); ret.extend_from_slice(bytes); + // append data item type id on the end of the file content + ret.push(1); ret } } } + fn get_data_type(&self) -> proto::data_item::DataItemDispatch { + match &self.data_item_dispatch.as_ref().unwrap() { + proto::data_item::DataItemDispatch::File(d) => { + proto::data_item::DataItemDispatch::File(FileData { + file_name_opt: d.file_name_opt.clone(), + is_dir_opt: d.is_dir_opt, + file_content: Vec::new(), + }) + } + proto::data_item::DataItemDispatch::RawBytes(_) => { + proto::data_item::DataItemDispatch::RawBytes(Vec::new()) + } + } + } + + fn into_data_bytes(mut self) -> Vec { + match self.data_item_dispatch.take() { + Some(proto::data_item::DataItemDispatch::File(f)) => f.file_content, + Some(proto::data_item::DataItemDispatch::RawBytes(bytes)) => bytes, + None => panic!("DataItem is empty"), + } + } + + fn inmem_size(&self) -> usize { + match &self.data_item_dispatch.as_ref().unwrap() { + proto::data_item::DataItemDispatch::File(f) => f.file_content.len(), + proto::data_item::DataItemDispatch::RawBytes(bytes) => bytes.len(), + } + } } pub trait ProtoExtDataEventTrigger { @@ -370,12 +494,19 @@ impl ProtoExtDataEventTrigger for DataEventTrigger { pub trait ProtoExtDataScheduleContext { fn dataitem_cnt(&self) -> DataItemIdx; + fn filepath(&self) -> Vec>; } impl ProtoExtDataScheduleContext for proto::DataScheduleContext { fn dataitem_cnt(&self) -> DataItemIdx { self.each_data_sz_bytes.len() as DataItemIdx } + fn filepath(&self) -> Vec> { + self.filepaths + .iter() + .map(|p| if p.is_empty() { None } else { Some(p.clone()) }) + .collect() + } } // Example usage in tests diff --git a/src/main/src/general/network/proto_src/data.proto b/src/main/src/general/network/proto_src/data.proto index e585b0d..a4b216b 100644 --- a/src/main/src/general/network/proto_src/data.proto +++ b/src/main/src/general/network/proto_src/data.proto @@ -1,5 +1,7 @@ syntax = "proto3"; -package 
data; +package proto; + +import "general/network/proto_src/sche.proto"; enum DataOpeType{ Read = 0; @@ -46,6 +48,8 @@ message DataScheduleContext{ // required DataOpeRoleFuncCall func_call = 5; } + repeated string filepaths=6; + proto.FnTaskId src_task_id=7; } message EachNodeSplit{ diff --git a/src/main/src/general/network/proto_src/sche.proto b/src/main/src/general/network/proto_src/sche.proto index fdec748..6d616b2 100644 --- a/src/main/src/general/network/proto_src/sche.proto +++ b/src/main/src/general/network/proto_src/sche.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package sche; +package proto; // import "network/proto_src/kv.proto"; @@ -35,17 +35,43 @@ message DistributeTaskReq { string app = 1; string func = 2; - uint32 task_id = 3; + FnTaskId task_id = 3; + FnTaskId trigger_src_task_id = 4; oneof trigger { - DataEventTriggerWrite event_write = 4; // For Write/WriteWithCondition - DataEventTriggerNew event_new = 5; // For New/NewWithCondition + DataEventTriggerWrite event_write = 5; // For Write/WriteWithCondition + DataEventTriggerNew event_new = 6; // For New/NewWithCondition } } +message FnTaskId{ + uint32 call_node_id=1; + uint32 task_id=2; +} + message DistributeTaskResp { bool success = 1; string err_msg = 2; } +// after master scheduled +// notify the src node task to wait for the target node task +message AddWaitTargetReq{ + uint32 src_task_id=1; + uint32 task_run_node=2; + FnTaskId sub_task_id=3; +} + +message AddWaitTargetResp{ + bool success=1; + string err_msg=2; +} + +message ListenForTaskDoneReq{ + FnTaskId task_id=1; +} +message ListenForTaskDoneResp{ + bool success=1; + string err_msg=2; +} \ No newline at end of file diff --git a/src/main/src/general/network/rpc_model.rs b/src/main/src/general/network/rpc_model.rs index 2d9776d..3b7cd80 100644 --- a/src/main/src/general/network/rpc_model.rs +++ b/src/main/src/general/network/rpc_model.rs @@ -18,21 +18,21 @@ use crate::result::{WSResult, WsFuncError, WsRpcErr}; // start from the begining #[async_trait] -pub trait RpcCustom: Sized + 'static { +pub trait RpcCustom: Clone + Sized + Send + 'static { type SpawnArgs: Send + 'static; fn bind(a: Self::SpawnArgs) -> UnixListener; // return true if the id matches remote call pack fn handle_remote_call(conn: &HashValue, id: u8, buf: &[u8]) -> bool; - async fn verify(buf: &[u8]) -> Option; + async fn verify(&self, buf: &[u8]) -> Option; // fn deserialize(id: u16, buf: &[u8]); } -pub fn spawn(a: R::SpawnArgs) -> tokio::task::JoinHandle<()> { - tokio::spawn(accept_task::(a)) +pub fn spawn(r: R, a: R::SpawnArgs) -> tokio::task::JoinHandle<()> { + tokio::spawn(accept_task::(r, a)) } -async fn accept_task(a: R::SpawnArgs) { +async fn accept_task(r: R, a: R::SpawnArgs) { // std::fs::remove_file(AGENT_SOCK_PATH).unwrap(); // clean_sock_file(AGENT_SOCK_PATH); @@ -41,7 +41,8 @@ async fn accept_task(a: R::SpawnArgs) { let listener = R::bind(a); loop { let (socket, _) = listener.accept().await.unwrap(); - let _ = tokio::spawn(listen_task::(socket)); + let r = r.clone(); + let _ = tokio::spawn(async move { listen_task::(r, socket).await }); } } @@ -142,14 +143,14 @@ lazy_static! 
{ static ref NEXT_TASK_ID: AtomicU32 = AtomicU32::new(0); } -async fn listen_task(socket: tokio::net::UnixStream) -> WSResult<()> { +async fn listen_task(r: R, socket: tokio::net::UnixStream) -> WSResult<()> { tracing::debug!("new connection: {:?}", socket.peer_addr().unwrap()); let (mut sockrx, socktx) = socket.into_split(); let mut buf = [0; 1024]; let mut len = 0; let (conn, rx) = - match listen_task_ext::verify_remote::(&mut sockrx, &mut len, &mut buf).await { + match listen_task_ext::verify_remote::(r, &mut sockrx, &mut len, &mut buf).await { Ok((conn, rx)) => (conn, rx), Err(err) => { tracing::debug!("verify failed {:?}", err); @@ -182,11 +183,13 @@ pub(super) mod listen_task_ext { use super::{HashValue, RpcCustom, CALL_MAP, CONN_MAP}; pub(super) async fn verify_remote( + r: R, sockrx: &mut OwnedReadHalf, len: &mut usize, buf: &mut [u8], ) -> WSResult<(HashValue, Receiver>)> { async fn verify_remote_inner( + r: R, sockrx: &mut OwnedReadHalf, len: &mut usize, buf: &mut [u8], @@ -212,7 +215,7 @@ pub(super) mod listen_task_ext { } // println!("wait done"); - let Some(id) = R::verify(&buf[4..4 + verify_msg_len]).await else { + let Some(id) = r.verify(&buf[4..4 + verify_msg_len]).await else { tracing::warn!("verify failed"); return Err(WsFuncError::InsranceVerifyFailed("verify failed".to_string()).into()); }; @@ -232,7 +235,7 @@ pub(super) mod listen_task_ext { } match tokio::time::timeout( Duration::from_secs(5), - verify_remote_inner::(sockrx, len, buf), + verify_remote_inner::(r, sockrx, len, buf), ) .await { @@ -281,7 +284,7 @@ pub(super) mod listen_task_ext { } if !R::handle_remote_call(&conn, msg_id, &buf[..msg_len]) { - // call back + tracing::debug!("msg id not remote call to sys, seen as sys call response"); let Some(cb) = CALL_MAP.write().remove(&taskid) else { tracing::warn!( "rd stream is not in correct format, taskid:{} msgid:{}", diff --git a/src/main/src/general/test_utils.rs b/src/main/src/general/test_utils.rs index 37e8c08..088c07e 100644 --- a/src/main/src/general/test_utils.rs +++ b/src/main/src/general/test_utils.rs @@ -14,6 +14,9 @@ lazy_static! 
{ Mutex::new(None); } +pub const TEST_SYS1_PORT: u16 = 2303; +pub const TEST_SYS2_PORT: u16 = 2307; + /// sys1 is the master, sys2 is the worker pub async fn get_test_sys<'a>() -> ( tokio::sync::MutexGuard< @@ -39,20 +42,22 @@ async fn start_2_node() -> ((Sys, LogicalModulesRef), (Sys, LogicalModulesRef)) let _ = fs::remove_dir_all("test_temp_dir1"); let _ = fs::remove_dir_all("test_temp_dir2"); - let node0: NodeConfig = serde_yaml::from_str( + let node0: NodeConfig = serde_yaml::from_str(&format!( r#" -addr: 127.0.0.1:2303 +addr: 127.0.0.1:{} spec: [meta,master] "#, - ) + TEST_SYS1_PORT + )) .unwrap(); - let node1: NodeConfig = serde_yaml::from_str( + let node1: NodeConfig = serde_yaml::from_str(&format!( r#" -addr: 127.0.0.1:2307 +addr: 127.0.0.1:{} spec: [meta,worker] "#, - ) + TEST_SYS2_PORT + )) .unwrap(); let sys1 = Sys::new(NodesConfig { diff --git a/src/main/src/master/app/fddg.rs b/src/main/src/master/app/fddg.rs index 31ce5ee..a0245f5 100644 --- a/src/main/src/master/app/fddg.rs +++ b/src/main/src/master/app/fddg.rs @@ -1,9 +1,7 @@ use crate::new_map; use crate::util::container::sync_trie::SyncedTrie; use crate::{ - general::{ - app::{AppType, FnMeta}, - }, + general::app::{AppType, FnMeta}, result::WSResult, }; use std::collections::HashMap; @@ -34,19 +32,39 @@ impl FDDGMgmt { // return app_name -> (apptype, fn_name -> fn_meta) pub fn get_binded_funcs( &self, - _data_unique_id: &str, + data_unique_id: &str, _ope: FuncTriggerType, ) -> HashMap)> { let mut binded_funcs = HashMap::new(); - let binded_matchers = self.prefix_key_to_functions.match_partial(_data_unique_id); + let binded_matchers = self.prefix_key_to_functions.match_partial(data_unique_id); for matcher in binded_matchers { + tracing::debug!("match fddg data prefix len {:?}", matcher.0); let node = matcher.1.read(); - for (app_name, (app_type, _fn_names)) in node.iter() { - let _ = binded_funcs - .entry(app_name.to_string()) - .or_insert((*app_type, HashMap::new())); + for (app_name, (app_type, fn_names)) in node.iter() { + for (fn_name, fn_meta) in fn_names.iter() { + let _ = binded_funcs + .entry(app_name.to_string()) + .and_modify(|(_, fn_names): &mut (AppType, HashMap)| { + let _ = fn_names.insert(fn_name.to_string(), fn_meta.clone()); + }) + .or_insert_with(|| { + ( + app_type.clone(), + new_map! 
(HashMap { + fn_name.to_string() => fn_meta.clone(), + }), + ) + }); + } } + // let _ = binded_funcs + // .entry(app_name.to_string()) + // .and_modify(|(_, fn_names)| { + + // }) + // .or_insert((*app_type, HashMap::new())); } + tracing::debug!("binded_funcs get: {:?}", binded_funcs); binded_funcs } diff --git a/src/main/src/master/app/mod.rs b/src/main/src/master/app/mod.rs index 0a559e8..33cf6b9 100644 --- a/src/main/src/master/app/mod.rs +++ b/src/main/src/master/app/mod.rs @@ -1,2 +1,5 @@ pub mod fddg; pub mod m_app_master; + +#[cfg(test)] +mod test; \ No newline at end of file diff --git a/src/main/src/master/app/test.rs b/src/main/src/master/app/test.rs index e69de29..b447c7f 100644 --- a/src/main/src/master/app/test.rs +++ b/src/main/src/master/app/test.rs @@ -0,0 +1,214 @@ +use crate::{ + config::{NodeConfig, NodesConfig}, + general::app::View, + util::command::CommandDebugStdio, +}; +use axum::body::Bytes; +use core::panic; +use path_absolutize::Absolutize; +use reqwest; +use serde_json; +use tokio::process::Command; +// use std::process::{Command, Stdio}; +use std::{collections::HashMap, env, fs, path::PathBuf, process::Stdio}; + +// #[cfg(test)] +use crate::general::test_utils; + +#[tokio::test(flavor = "multi_thread")] +async fn test_app_upload() -> Result<(), Box> { + // install java related by scripts/install/2.3install_java_related.py + // run real time output command + let (stdout_task, stderr_task, mut child) = Command::new("bash") + .arg("-c") + .arg("python3 scripts/install/2.3install_java_related.py") + .current_dir("../../../../../middlewares/waverless/waverless") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn_debug() + .await + .unwrap(); + let status = child.wait().await.unwrap(); + if !status.success() { + panic!( + "install java related failed, stderr: {}, stdout: {}", + stderr_task.await.unwrap(), + stdout_task.await.unwrap() + ); + } + + // 使用 get_test_sys 新建两个系统模块(一个 master,一个 worker) + let ( + _sys_guard, // 互斥锁守卫 + _master_logical_modules, // 系统 0 (Master) 的逻辑模块引用 + worker_logical_modules, // 系统 1 (Worker) 的逻辑模块引用 + ) = test_utils::get_test_sys().await; + + // 延迟等待连接稳定 + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + + //调用 bencher 的 prepare 模式触发应用上传 + tracing::debug!("test_app_upload uploading app"); + + // 创建临时配置文件 + let temp_dir = tempfile::tempdir().expect("Failed to create temp directory"); + let config_path = temp_dir.path().join("cluster_config.yml"); + + // 写入配置内容 + let config_content = format!( + r#" +master: + ip: 127.0.0.1:{} + is_master: +worker: + ip: 127.0.0.1:{} +"#, + test_utils::TEST_SYS1_PORT + 1, + test_utils::TEST_SYS2_PORT + 1, + ); + std::fs::write(&config_path, config_content).expect("Failed to write config file"); + + //sleep 10s + tracing::debug!("test_app_upload sleep 4s for system to be ready"); + tokio::time::sleep(tokio::time::Duration::from_secs(10)).await; + + // 获取配置文件的绝对路径 + let config_path_str = config_path.to_str().expect("Invalid path"); + + let command_str = format!( + "echo $PWD && \ + cargo run -- \ + simple_demo/simple \ + --with-wl \ + --prepare \ + --config {}", + config_path_str + ); + + let (stdout_task, stderr_task, mut child) = Command::new("bash") + .arg("-c") + .arg(command_str) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .current_dir("../../../../../bencher") // 设置工作目录 + .spawn_debug() + .await + .unwrap_or_else(|err| { + // 确保路径是绝对路径 + let absolute_path = env::current_dir(); + + panic!("Command failed to execute: {:?} {:?}", absolute_path, err,) + }); + + let status = 
child.wait().await.unwrap(); + if !status.success() { + panic!( + "Command failed to execute: {:?}\nstdout: {}\nstderr: {}", + status, + stdout_task.await.unwrap(), + stderr_task.await.unwrap() + ); + } + + tracing::debug!( + "test_app_upload app uploaded", + // stdout_task.await.unwrap() + ); + + // // 增加延迟等待上传完成 + // tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + + // 应用名称 + let appname = "simple_demo"; + // 读取本地 ZIP 文件的内容 + let zip_path = format!("../../../../../middlewares/waverless/{}.zip", appname); + let zip_content = tokio::fs::read(zip_path) + .await /* */ + .unwrap_or_else(|err| { + panic!( + "test read app zip failed {:?}, current path is {:?}", + err, + std::env::current_dir().unwrap().absolutize().unwrap() + ) + }); + + let zip_bytes = Bytes::from(zip_content); + + // 获取当前视图的逻辑 + let view2 = View::new(worker_logical_modules.clone()); // 直接使用 master_logical_modules + let app_meta_manager2 = view2.appmeta_manager(); + + // 通过状态标志位校验应用上传 http 接口是否正常 + let test_http_app_uploaded = { + let test_http_app_uploaded_guard = app_meta_manager2.test_http_app_uploaded.lock(); + test_http_app_uploaded_guard.clone() + }; + // 检查标志位是否为空 + if test_http_app_uploaded.is_empty() { + panic!("应用上传失败:未接收到上传数据"); + } + + tracing::debug!("test_app_upload verifying app uploaded bytes"); + assert!(test_http_app_uploaded == zip_bytes, "应用上传失败"); + + // 调用数据接口校验应用是否上传完成 + tracing::debug!("test_app_upload verifying app meta"); + let app_meta = app_meta_manager2.get_app_meta("simple_demo").await; + assert!(app_meta.is_ok(), "Failed to get app meta"); + let app_meta = app_meta.unwrap(); + assert!(app_meta.is_some(), "App meta data not found"); + + // wait for checkpoint + tracing::debug!("test_app_upload wait 10s for checkpoint"); + for i in 0..10 { + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + tracing::debug!("test_app_upload waited {}s", i + 1); + } + // tokio::time::sleep(tokio::time::Duration::from_secs(10)).await; + + // 发起对函数的 http 请求校验应用是否运行 + tracing::debug!("test_app_upload try calling test app"); + let client = reqwest::Client::new(); + let response = client + .post(&format!( + "http://localhost:{}/simple_demo/simple", + test_utils::TEST_SYS1_PORT + 1 + )) + .body("{}") + .send() + .await + .expect("Failed to send HTTP request"); + + let status = response.status().as_u16(); + + let resptext = response + .text() + .await + .unwrap_or_else(|err| panic!("receive bytes failed with error {}", err)); + // let respmaybestr = std::str::from_utf8(&respbytes); + tracing::debug!("test_app_upload call app resp with {} {}", status, resptext); + // 验证响应状态码 + + if status != 200 { + panic!("call application failed"); + } + // 解析响应 + // let response_text = response.text().await.expect("Failed to read response text"); + let res: serde_json::Value = + serde_json::from_str(&resptext).expect("Failed to parse response as JSON"); + + // 验证响应中包含必要的时间戳字段 + assert!( + res.get("req_arrive_time").is_some(), + "Missing req_arrive_time" + ); + assert!(res.get("bf_exec_time").is_some(), "Missing bf_exec_time"); + assert!( + res.get("recover_begin_time").is_some(), + "Missing recover_begin_time" + ); + assert!(res.get("fn_start_time").is_some(), "Missing fn_start_time"); + assert!(res.get("fn_end_time").is_some(), "Missing fn_end_time"); + + Ok(()) // 返回 Ok(()) 表示成功 +} diff --git a/src/main/src/master/data/m_data_master.rs b/src/main/src/master/data/m_data_master.rs index 60f5d63..711f0c9 100644 --- a/src/main/src/master/data/m_data_master.rs +++ b/src/main/src/master/data/m_data_master.rs 
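// --- Editor's note on the test above: the fixed sleeps ("sleep 4s/10s for the
// system to be ready") make test_app_upload slow and racy. A hedged sketch of a
// deadline-based poll that could replace them (helper name and placement are
// hypothetical, not part of this patch):
async fn wait_until(mut ready: impl FnMut() -> bool, timeout: std::time::Duration) -> bool {
    let deadline = tokio::time::Instant::now() + timeout;
    while tokio::time::Instant::now() < deadline {
        if ready() {
            return true; // condition met before the deadline
        }
        // re-check periodically instead of one long fixed sleep
        tokio::time::sleep(std::time::Duration::from_millis(200)).await;
    }
    false // timed out; the caller decides whether to panic
}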
@@ -1,4 +1,5 @@ use crate::general::app::m_executor::Executor; +use crate::general::app::AffinityPattern; use crate::general::app::AppMetaManager; use crate::general::app::DataEventTrigger; use crate::general::data::m_data_general::CacheModeVisitor; @@ -6,6 +7,7 @@ use crate::general::network::m_p2p::{P2PModule, RPCCaller, RPCHandler, RPCRespon use crate::general::network::proto::{ self, DataVersionScheduleRequest, DataVersionScheduleResponse, }; +use crate::general::network::proto_ext::ProtoExtDataScheduleContext; use crate::master::m_master::{FunctionTriggerContext, Master}; use crate::result::{WSResult, WSResultExt}; use crate::sys::{LogicalModulesRef, NodeID}; @@ -13,8 +15,8 @@ use crate::util::JoinHandleWrapper; use crate::{ general::data::{ m_data_general::{ - CacheMode, DataGeneral, DataSetMetaBuilder, DataSplit, - EachNodeSplit, CACHE_MODE_MAP_COMMON_KV_MASK, CACHE_MODE_TIME_FOREVER_MASK, + CacheMode, DataGeneral, DataSetMetaBuilder, DataSplit, EachNodeSplit, + CACHE_MODE_MAP_COMMON_KV_MASK, CACHE_MODE_TIME_FOREVER_MASK, }, m_kv_store_engine::{KeyType, KeyTypeDataSetMeta, KvAdditionalConf, KvStoreEngine}, }, @@ -48,7 +50,6 @@ pub struct DataMaster { rpc_handler: RPCHandler, rpc_caller_data_meta_update: RPCCaller, } - #[async_trait] impl LogicalModule for DataMaster { fn inner_new(args: LogicalModuleNewArgs) -> Self @@ -59,6 +60,7 @@ impl LogicalModule for DataMaster { rpc_handler: RPCHandler::new(), view: DataMasterView::new(args.logical_modules_ref.clone()), rpc_caller_data_meta_update: RPCCaller::new(), + // rpc_caller_add_wait_target: RPCCaller::new(), // view: DataMasterView::new(args.logical_modules_ref.clone()), } } @@ -70,6 +72,7 @@ impl LogicalModule for DataMaster { tracing::info!("start as master"); let view = self.view.clone(); let _ = self.rpc_caller_data_meta_update.regist(view.p2p()); + // let _ = self.rpc_caller_add_wait_target.regist(view.p2p()); let _ = self .rpc_handler .regist(self.view.p2p(), move |responsor, req| { @@ -96,7 +99,15 @@ impl DataMaster { // 如果不是有效的 UTF-8 字符串,直接返回空结果 let data_unique_id_str = match std::str::from_utf8(data_unique_id) { Ok(s) => s, - Err(_) => return Ok((DataSetMetaBuilder::new().build().cache_mode, vec![], vec![])), + Err(_) => { + return Ok(( + DataSetMetaBuilder::new(context.filepath()) + .build() + .cache_mode, + vec![], + vec![], + )) + } }; // 获取绑定的函数 @@ -107,37 +118,101 @@ impl DataMaster { .get_binded_funcs(data_unique_id_str, func_trigger_type); // 收集所有调度节点作为缓存节点 - let mut cache_nodes = HashSet::new(); + let cache_nodes = HashSet::new(); // 对每个绑定的函数进行调度 for (app_name, (_, fn_names)) in &binded_funcs { - for (fn_name, _unused) in fn_names { + for (fn_name, fnmeta) in fn_names { + let target_nodes: Vec = if let Some(affinity) = fnmeta.affinity.clone() { + match affinity.nodes { + AffinityPattern::All => self + .view + .p2p() + .nodes_config + .all_nodes_iter() + .filter(|(id, _)| { + **id != self.view.p2p().nodes_config.get_master_node() + }) + .map(|(id, _)| *id) + .collect(), + AffinityPattern::List(nodes) => nodes, + AffinityPattern::NodeCount(_) => { + todo!() + // self.view + // .p2p() + // .nodes_config + // .all_nodes_iter() + // .filter(|(id, _)| id != self.view.p2p().nodes_config.get_master_node()) + } + } + } else { + vec![self.view.master().select_node()] + }; // 选择调度节点 (暂时不考虑亲和性规则) - let target_node = self.view.master().select_node(); + // let target_node = ; // 将调度节点加入缓存节点集合 - let _ = cache_nodes.insert(target_node); + // let _ = cache_nodes.insert(target_node); + + // 发送触发请求并处理可能的错误 + tracing::debug!( + "data 
{:?} write trigger function {}/{} on nodes {:?}", + data_unique_id, + app_name, + fn_name, + &target_nodes + ); // 创建函数触发上下文 let ctx = FunctionTriggerContext { app_name: app_name.clone(), fn_name: fn_name.clone(), data_unique_id: data_unique_id.to_vec(), - target_nodes: vec![target_node], // 只在选中的节点上触发 + target_nodes: target_nodes, // 只在选中的节点上触发 timeout: Duration::from_secs(60), event_type: DataEventTrigger::Write, // 使用Write事件类型 + src_task_id: context.src_task_id.clone().unwrap(), }; - // 发送触发请求并处理可能的错误 - if let Err(e) = self.view.master().trigger_func_call(ctx).await { - tracing::error!( - "Failed to trigger function {}/{} on node {}: {:?}", - app_name, - fn_name, - target_node, - e - ); - } + // async call with unique task, don't block current task + let view = self.view.clone(); + let app_name = app_name.clone(); + let fn_name = fn_name.clone(); + let _ = tokio::spawn(async move { + if let Err(e) = view.master().trigger_func_call(ctx).await { + tracing::error!( + "Failed to trigger function {}/{}: {:?}", + app_name, + fn_name, + e + ); + } + }); + // if let Err(e) = self + // .rpc_caller_add_wait_target + // .call( + // self.view.p2p(), + // context.src_node_id, + // proto::AddWaitTargetReq { + // task_id: context.src_task_id, + // }, + // Some(Duration::from_secs(60)), + // ) + // .await + // { + // tracing::error!( + // "Failed to add wait target for data({:?}) on node {}: {:?}", + // data_unique_id, + // context.src_node_id, + // e + // ); + // } else { + // tracing::debug!( + // "add wait target for data({:?}) on node {} success", + // data_unique_id, + // context.src_node_id + // ); + // } } } @@ -187,16 +262,22 @@ impl DataMaster { } // 设置缓存模式 - let mut builder = DataSetMetaBuilder::new(); + let mut builder = DataSetMetaBuilder::new(context.filepath()); // 设置数据分片 let _ = builder.set_data_splits(splits.clone()); - // 暂时用zui'lzuil + // 暂时用zui'lzuil for idx in 0..splits.len() { - let _= builder.cache_mode_time_auto(idx as u8).cache_mode_pos_auto(idx as u8); + let _ = builder + .cache_mode_time_auto(idx as u8) + .cache_mode_pos_auto(idx as u8); } - let cache_modes=builder.build().cache_mode; - tracing::debug!("planned for write data({:?}) cache_modes: {:?}", data_unique_id, cache_modes); + let cache_modes = builder.build().cache_mode; + tracing::debug!( + "planned for write data({:?}) cache_modes: {:?}", + data_unique_id, + cache_modes + ); Ok((cache_modes, splits, cache_nodes)) } @@ -244,7 +325,7 @@ impl DataMaster { builder.build() } else { tracing::debug!("new dataset meta for data({:?})", req.unique_id); - let mut builder = DataSetMetaBuilder::new(); + let mut builder = DataSetMetaBuilder::new(ctx.filepath()); // version let _ = builder.version(1); // data splits bf cache mod @@ -267,7 +348,11 @@ impl DataMaster { // update version peers { - tracing::debug!("updating meta({:?}) to peers for data({:?})", new_meta, req.unique_id); + tracing::debug!( + "updating meta({:?}) to peers for data({:?})", + new_meta, + req.unique_id + ); let need_notify_nodes = { let mut need_notify_nodes = HashSet::new(); for one_data_splits in &new_meta.datas_splits { @@ -327,7 +412,6 @@ impl DataMaster { }); } } - tracing::debug!( "data:{:?} version required({}) and schedule done, caller will do following thing after receive `DataVersionScheduleResponse`", @@ -347,9 +431,10 @@ impl DataMaster { .collect(), cache_nodes, }) - .await{ - tracing::error!("Failed to send data version schedule response: {}", e); - } + .await + { + tracing::error!("Failed to send data version schedule response: {}", e); + } // 
.todo_handle("This part of the code needs to be implemented."); Ok(()) } diff --git a/src/main/src/master/m_http_handler.rs b/src/main/src/master/m_http_handler.rs index 2d6f528..f6e5441 100644 --- a/src/main/src/master/m_http_handler.rs +++ b/src/main/src/master/m_http_handler.rs @@ -105,19 +105,19 @@ impl HttpHandler for MasterHttpHandler { return self.handle_prometheus(); } - let view = self.view.clone(); - if !view.p2p().nodes_config.this.1.is_master() { - tracing::debug!("this is_master"); - match self.view.appmeta_manager().app_available(app).await { - Ok(true) => {} - Ok(false) => { - return (StatusCode::NOT_FOUND, "app not found").into_response(); - } - Err(e) => { - return (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response(); - } - } - } + // let view = self.view.clone(); + // if !view.p2p().nodes_config.this.1.is_master() { + // tracing::debug!("this is_master"); + // match self.view.appmeta_manager().app_available(app).await { + // Ok(true) => {} + // Ok(false) => { + // return (StatusCode::NOT_FOUND, "app not found").into_response(); + // } + // Err(e) => { + // return (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response(); + // } + // } + // } // check app is available // match self.view.appmeta_manager().app_available(app).await { diff --git a/src/main/src/master/m_master.rs b/src/main/src/master/m_master.rs index 1e12724..daa0a06 100644 --- a/src/main/src/master/m_master.rs +++ b/src/main/src/master/m_master.rs @@ -1,6 +1,7 @@ use std::{ collections::hash_map::DefaultHasher, hash::Hasher, + mem::take, sync::atomic::{AtomicU32, Ordering}, time::Duration, }; @@ -13,18 +14,15 @@ use ws_derive::LogicalModule; use crate::{ config::NodesConfig, general::{ - app::{AppMetaManager, DataEventTrigger}, + app::{m_executor::Executor, AppMetaManager, DataEventTrigger}, network::{ m_p2p::{P2PModule, RPCCaller}, - proto::{ - self, - sche::{self, distribute_task_req::Trigger, DistributeTaskReq}, - }, + proto::{self, distribute_task_req::Trigger, DistributeTaskReq}, proto_ext::ProtoExtDataEventTrigger, }, }, logical_module_view_impl, - result::{WSResult, WsFuncError}, + result::{WSResult, WSResultExt, WsFuncError}, sys::{LogicalModule, LogicalModuleNewArgs, LogicalModulesRef, NodeID}, util::JoinHandleWrapper, }; @@ -86,6 +84,7 @@ logical_module_view_impl!(MasterView); logical_module_view_impl!(MasterView, p2p, P2PModule); logical_module_view_impl!(MasterView, master, Option); logical_module_view_impl!(MasterView, appmeta_manager, AppMetaManager); +logical_module_view_impl!(MasterView, executor, Executor); #[derive(Clone)] pub struct FunctionTriggerContext { @@ -95,13 +94,16 @@ pub struct FunctionTriggerContext { pub target_nodes: Vec, pub timeout: Duration, pub event_type: DataEventTrigger, + pub src_task_id: proto::FnTaskId, } #[derive(LogicalModule)] pub struct Master { - pub rpc_caller_distribute_task: RPCCaller, + pub rpc_caller_distribute_task: RPCCaller, + rpc_caller_add_wait_target: RPCCaller, + view: MasterView, - task_id_allocator: AtomicU32, + // task_id_allocator: AtomicU32, ope_id_allocator: AtomicU32, } @@ -114,13 +116,14 @@ impl LogicalModule for Master { Self { view: MasterView::new(args.logical_modules_ref.clone()), rpc_caller_distribute_task: RPCCaller::default(), - task_id_allocator: AtomicU32::new(0), ope_id_allocator: AtomicU32::new(0), + rpc_caller_add_wait_target: RPCCaller::default(), } } async fn start(&self) -> WSResult> { tracing::info!("start as master"); self.rpc_caller_distribute_task.regist(&self.view.p2p()); + 
self.rpc_caller_add_wait_target.regist(&self.view.p2p()); Ok(vec![]) } @@ -156,31 +159,31 @@ impl Master { pub async fn handle_http_schedule(&self, _app: &str) -> NodeID { self.select_node() } - pub async fn schedule_one_trigger(&self, app: String, func: String, trigger_data: Trigger) { - match self - .view - .master() - .rpc_caller_distribute_task - .call( - //理解 - self.view.p2p(), - self.select_node(), - DistributeTaskReq { - app, - func, - task_id: 0, // TODO: Context task id for one request - trigger: Some(trigger_data), - }, - Duration::from_secs(60).into(), - ) - .await - { - Ok(_) => {} - Err(err) => { - tracing::error!("schedule_one_trigger err: {:?}", err); - } - } - } + // pub async fn schedule_one_trigger(&self, app: String, func: String, trigger_data: Trigger) { + // match self + // .view + // .master() + // .rpc_caller_distribute_task + // .call( + // //理解 + // self.view.p2p(), + // self.select_node(), + // DistributeTaskReq { + // app, + // func, + // task_id: 0, // TODO: Context task id for one request + // trigger: Some(trigger_data), + // }, + // Duration::from_secs(60).into(), + // ) + // .await + // { + // Ok(_) => {} + // Err(err) => { + // tracing::error!("schedule_one_trigger err: {:?}", err); + // } + // } + // } pub fn select_node(&self) -> NodeID { let workers = self.view.p2p().nodes_config.get_worker_nodes(); let mut rng = rand::thread_rng(); @@ -224,28 +227,66 @@ impl Master { } // Generate task and operation IDs - let task_id = self.task_id_allocator.fetch_add(1, Ordering::Relaxed); + let task_id = self.view.executor().register_sub_task(); let opeid = self.ope_id_allocator.fetch_add(1, Ordering::Relaxed); // Create trigger using the ProtoExtDataEventTrigger trait - let trigger = DataEventTrigger::Write.into_proto_trigger(ctx.data_unique_id, opeid); + let trigger = DataEventTrigger::Write.into_proto_trigger(ctx.data_unique_id.clone(), opeid); // Create and send tasks to target nodes + let mut each_node_calling = vec![]; for &node in &ctx.target_nodes { - let req = sche::DistributeTaskReq { - app: ctx.app_name.clone(), - func: ctx.fn_name.clone(), - task_id, - trigger: Some(trigger.clone()), - }; - - // Send request with timeout - let _ = tokio::time::timeout( - ctx.timeout, - self.rpc_caller_distribute_task - .call(self.view.p2p(), node, req, Some(ctx.timeout)), - ) - .await; + let view = self.view.clone(); + let ctx = ctx.clone(); + let task_id = task_id.clone(); + let trigger = trigger.clone(); + // let src_task_id = ctx.src_task_id.clone(); + // let timeout = ctx.timeout; + let t = tokio::spawn(async move { + // before trigger function, add wait target to src node + let _ = view + .master() + .rpc_caller_add_wait_target + .call( + view.p2p(), + ctx.src_task_id.call_node_id, + proto::AddWaitTargetReq { + src_task_id: ctx.src_task_id.task_id, + sub_task_id: Some(task_id.clone()), + task_run_node: node, + }, + Some(ctx.timeout), + ) + .await + .todo_handle("call add wait target rpc failed"); + // ctx.src_task_id + + let req = proto::DistributeTaskReq { + app: ctx.app_name.clone(), + func: ctx.fn_name.clone(), + task_id: Some(task_id.clone()), + trigger: Some(trigger.clone()), + trigger_src_task_id: Some(ctx.src_task_id.clone()), + }; + + // Send request with timeout + // let _ = tokio::time::timeout( + // ctx.timeout, + let _ = view + .master() + .rpc_caller_distribute_task + .call(view.p2p(), node, req, Some(ctx.timeout)) + .await + .todo_handle("fddg trigger func call rpc failed"); + }); + each_node_calling.push(t); + // ) + // .await; + } + + for t in 
each_node_calling { + let _ = t.await; + // .todo_handle("fddg trigger func call rpc failed"); } Ok(()) diff --git a/src/main/src/modules_global_bridge/mod.rs b/src/main/src/modules_global_bridge/mod.rs index bf86c32..2826a7a 100644 --- a/src/main/src/modules_global_bridge/mod.rs +++ b/src/main/src/modules_global_bridge/mod.rs @@ -1,9 +1,9 @@ -use std::future::Future; -use crate::result::WSError;//虞光勇修改,修改内容:增加use crate::result::WSError;来导入 WSError。 -use crate::result::WsRuntimeErr;//虞光勇修改,修改内容:增加use crate::result::WsRuntimeErr;来导入 WsRuntimeErr。 +use crate::result::WSError; //虞光勇修改,修改内容:增加use crate::result::WSError;来导入 WSError。 use crate::result::WSResult; +use crate::result::WsRuntimeErr; //虞光勇修改,修改内容:增加use crate::result::WsRuntimeErr;来导入 WsRuntimeErr。 use crate::sys::LogicalModules; use crate::sys::LogicalModulesRef; +use std::future::Future; pub mod process_func; @@ -15,21 +15,21 @@ tokio::task_local! { static MODULES_REF: LogicalModulesRef; } -pub fn try_get_modules_ref() -> WSResult { - //没有处理try_wth的错误返回 曾俊 - // let mut res=Err(WSError::WsRuntimeErr(WsRuntimeErr::ModulesRefOutofLifetime)); - // MODULES_REF.try_with(|m|{ - // res=Ok(m.clone()); - // }); - // res +// pub fn try_get_modules_ref() -> WSResult { +// //没有处理try_wth的错误返回 曾俊 +// // let mut res=Err(WSError::WsRuntimeErr(WsRuntimeErr::ModulesRefOutofLifetime)); +// // MODULES_REF.try_with(|m|{ +// // res=Ok(m.clone()); +// // }); +// // res - MODULES_REF.try_with(|m| { - // 克隆 m 并返回 Ok 结果 - Ok(m.clone()) - }) - // 如果 try_with 失败,则返回相应的错误 - .map_err(|_e| WSError::WsRuntimeErr(WsRuntimeErr::ModulesRefOutofLifetime))? -} +// MODULES_REF.try_with(|m| { +// // 克隆 m 并返回 Ok 结果 +// Ok(m.clone()) +// }) +// // 如果 try_with 失败,则返回相应的错误 +// .map_err(|_e| WSError::WsRuntimeErr(WsRuntimeErr::ModulesRefOutofLifetime))? 
+// }
 
 // Zeng Jun: the return value of `scope` was not handled
 // pub fn modules_ref_scope(modules_ref: LogicalModulesRef,future: impl Future) {
@@ -39,9 +39,11 @@
 pub async fn modules_ref_scope<F>(modules_ref: LogicalModulesRef, future: F)
 where
     F: Future + 'static,
 {
-    MODULES_REF.scope(modules_ref, async move {
-        let _ = future.await;
-    }).await;
+    MODULES_REF
+        .scope(modules_ref, async move {
+            let _ = future.await;
+        })
+        .await;
 }
 
 fn modules() -> &'static LogicalModules {
diff --git a/src/main/src/result.rs b/src/main/src/result.rs
index 53ff8ff..8f746a4 100644
--- a/src/main/src/result.rs
+++ b/src/main/src/result.rs
@@ -11,8 +11,8 @@ use zip_extract::ZipExtractError;
 
 use crate::{
     general::{
-        app::FnMeta,
-        data::m_data_general::{DataItemIdx, DataSplitIdx, EachNodeSplit},
+        app::{m_executor::EventCtx, AppMeta, FnMeta},
+        data::m_data_general::{DataItemIdx, DataSetMetaV2, DataSplitIdx, EachNodeSplit},
         network::{proto, rpc_model::HashValue},
     },
     sys::NodeID,
@@ -31,9 +31,7 @@ pub enum WsNetworkLogicErr {
     DecodeError(DecodeError),
     MsgIdNotDispatchable(u32),
     InvaidNodeID(NodeID),
-    TaskJoinError {
-        err: tokio::task::JoinError
-    },
+    TaskJoinError { err: tokio::task::JoinError },
 }
 
 #[derive(Debug)]
@@ -139,6 +137,11 @@ pub enum WsFuncError {
         app: String,
         func: String,
     },
+    AppPackLoadFailed {
+        app: String,
+        err: Option>,
+        context: String,
+    },
     InvalidHttpUrl(String),
     FuncHttpNotSupported {
         fname: String,
@@ -152,6 +155,13 @@ pub enum WsFuncError {
         func: String,
         http_err: reqwest::Error,
     },
+    FuncTriggerAppInvalid {
+        key: Vec<u8>,
+        /// only when parse success
+        // app: Option,
+        appmeta: Option<(String, Option<(AppMeta, Option)>)>,
+        context: String,
+    },
     AppPackFailedZip(ZipExtractError),
     AppPackNoExe,
     AppPackExeName(String),
@@ -175,6 +185,11 @@ pub enum WsFuncError {
     InstanceProcessStartFailed(std::io::Error),
     InsranceVerifyFailed(String),
     UnsupportedAppType,
+    InvalidTriggerForAppFunction {
+        app: String,
+        func: String,
+        trigger_type: EventCtx,
+    },
 }
 
 #[derive(Debug)]
@@ -228,6 +243,9 @@ pub enum WsDataError {
         actual: proto::data_item::DataItemDispatch,
         context: String,
     },
+    FileNotFound {
+        path: PathBuf,
+    },
     FileMetadataErr {
         path: PathBuf,
         err: std::io::Error,
@@ -253,7 +271,7 @@ pub enum WsDataError {
         path: String,
         err: Infallible,
     },
-    UnzipErr{
+    UnzipErr {
         path: PathBuf,
         err: ZipExtractError,
     },
@@ -313,11 +331,11 @@ pub enum WsDataError {
         actual: u64,
     },
     SizeMismatch {
-        expected: usize, // expected data size
-        actual: usize, // actual data size
+        expected: usize, // expected data size
+        actual: usize,   // actual data size
     },
     ReadDataFailed {
-        path: PathBuf, // path of the file whose read failed
+        path: PathBuf, // path of the file whose read failed
     },
     /// error in a data split task
     DataSplitTaskError {
@@ -330,6 +348,22 @@ pub enum WsDataError {
         /// data type (for debugging)
         data_type: String,
     },
+    /// failed to create the tmp zip file while transferring a directory
+    TransferDirCreateTmpFileFailed {
+        path: PathBuf,
+        err: std::io::Error,
+        context: String,
+    },
+    /// failed to keep (persist) the tmp zip file while transferring a directory
+    TransferDirPersistTmpFileKeepFailed {
+        path: PathBuf,
+        err: tempfile::PersistError,
+        context: String,
+    },
+    FileCreateErr {
+        path: PathBuf,
+        err: std::io::Error,
+    },
 }
 
 #[derive(Error, Debug)]
@@ -461,7 +495,7 @@ impl_err_convertor!(InitializeError, WsRaftErr, InitializeError);
 impl_err_convertor!(RaftError, WsRaftErr, RaftError);
 impl_err_convertor!(std::io::Error, WsIoErr, Io);
 
-pub trait WSResultExt :Sized {
+pub trait WSResultExt: Sized {
     fn todo_handle(self, err_comment: &str) -> Self;
 }
 
@@ -485,4 +519,3 @@ impl WSResultExt for WSError {
         self
     }
 }
-
diff --git a/src/main/src/sys.rs b/src/main/src/sys.rs
index 40937c3..4296248 100644
--- a/src/main/src/sys.rs
+++ b/src/main/src/sys.rs
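// --- Editor's note: a self-contained illustration of the task-local pattern that
// `modules_ref_scope` above relies on (names and values here are illustrative,
// not from this patch). `scope(...)` makes the value visible to any `with` call
// made while the wrapped future runs on the same task:
tokio::task_local! {
    static CTX: u32;
}

async fn scoped_read() -> u32 {
    CTX.scope(7, async {
        // readable anywhere inside the scope, mirroring how modules()
        // resolves MODULES_REF inside modules_ref_scope
        CTX.with(|v| *v)
    })
    .await
}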
@@ -37,11 +37,11 @@ impl Drop for Sys { impl Sys { pub fn new(config: NodesConfig) -> Sys { - // chdir to file_path - std::env::set_current_dir(&config.file_dir).unwrap(); - tracing::info!("Running at dir: {:?}", std::env::current_dir()); - + // std::env::set_current_dir(&config.file_dir) + // .unwrap_or_else(|err| panic!("failed to start sys at {}", config.file_dir)); + tracing::info!("Running at dir: {:?}", config.file_dir); + Sys { logical_modules: LogicalModules::new(config), sub_tasks: Vec::new().into(), @@ -51,7 +51,7 @@ impl Sys { pub fn new_logical_modules_ref(&self) -> LogicalModulesRef { LogicalModulesRef::new(self.logical_modules.clone()) } - + pub async fn wait_for_end(&mut self) { if let Err(err) = (*self.logical_modules).as_ref().unwrap().start(self).await { panic!("start logical nodes error: {:?}", err); @@ -189,6 +189,9 @@ macro_rules! logical_module_view_impl { pub fn new(inner: LogicalModulesRef) -> Self { $module { inner } } + pub fn copy_module_ref(&self) -> LogicalModulesRef { + self.inner.clone() + } // fn setup(&mut self, modules: Arc) { // self.inner.setup(modules); // } diff --git a/src/main/src/util/command.rs b/src/main/src/util/command.rs new file mode 100644 index 0000000..e819837 --- /dev/null +++ b/src/main/src/util/command.rs @@ -0,0 +1,54 @@ +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::{Child, Command}, +}; + +use crate::result::WSResult; + +pub trait CommandDebugStdio { + async fn spawn_debug( + &mut self, + ) -> WSResult<( + tokio::task::JoinHandle, + tokio::task::JoinHandle, + Child, + )>; +} + +impl CommandDebugStdio for Command { + async fn spawn_debug( + &mut self, + ) -> WSResult<( + tokio::task::JoinHandle, + tokio::task::JoinHandle, + Child, + )> { + let mut child = self.spawn()?; + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + + // 分别处理 stdout 和 stderr + let mut stdout_reader = BufReader::new(stdout).lines(); + let mut stderr_reader = BufReader::new(stderr).lines(); + + let stdout_task = tokio::spawn(async move { + let mut all = String::new(); + while let Ok(Some(line)) = stdout_reader.next_line().await { + println!("[STDOUT] {}", line); + all += &format!("[STDOUT] {}\n", line); + } + all + }); + + let stderr_task = tokio::spawn(async move { + let mut all = String::new(); + while let Ok(Some(line)) = stderr_reader.next_line().await { + eprintln!("[STDERR] {}", line); + all += &format!("[STDERR] {}\n", line); + } + all + }); + + Ok((stdout_task, stderr_task, child)) + } +} diff --git a/src/main/src/util/mod.rs b/src/main/src/util/mod.rs index 221b76a..7c98602 100644 --- a/src/main/src/util/mod.rs +++ b/src/main/src/util/mod.rs @@ -1,3 +1,4 @@ +pub mod command; pub mod container; pub mod zip; @@ -181,6 +182,12 @@ unsafe impl Send for FutureWrapper where F: Future {} pub struct SendNonNull(pub NonNull); unsafe impl Send for SendNonNull {} +impl SendNonNull { + pub fn as_mut(&self) -> &mut T { + unsafe { &mut *self.0.as_ptr() } + } +} + pub fn call_async_from_sync(fut: Fut) -> Fut::Output where Fut: std::future::Future + 'static, diff --git a/src/main/src/util/zip.rs b/src/main/src/util/zip.rs index 791bc69..27dcf03 100644 --- a/src/main/src/util/zip.rs +++ b/src/main/src/util/zip.rs @@ -1,10 +1,12 @@ -use std::path::Path; -use std::io::{self, Write, Seek, Cursor,Read}; +use crate::result::{WSError, WSResult, WsIoErr}; use std::fs; +use std::io::{self, Cursor, Read, Seek, Write}; +use std::os::unix::fs::PermissionsExt; +use std::path::Path; use walkdir::WalkDir; -use zip::{write::FileOptions, 
ZipWriter, result::ZipError}; -use crate::result::{WSResult, WSError, WsIoErr}; -use std::os::unix::fs::PermissionsExt; // 添加这一行以引入PermissionsExt trait 针对下方的.mode()报错 曾俊 +use zip::{result::ZipError, write::FileOptions, ZipWriter}; + +use super::{non_null, SendNonNull}; // 添加这一行以引入PermissionsExt trait 针对下方的.mode()报错 曾俊 pub fn unzip_data_2_path(p: impl AsRef, data: Vec) -> WSResult<()> { // remove old dir @@ -64,7 +66,7 @@ where .metadata() .map_err(|e| WSError::from(e))? .permissions() - .mode(), // 修改!!! 在文件上方导入了一个PermissionsExt trait 曾俊 + .mode(), // 修改!!! 在文件上方导入了一个PermissionsExt trait 曾俊 ); // Write file or directory explicitly @@ -93,10 +95,7 @@ where Ok(()) } -pub fn zip_dir_2_mem( - src_dir: &Path, - method: zip::CompressionMethod, -) -> WSResult> { +pub fn zip_dir_2_mem(src_dir: &Path, method: zip::CompressionMethod) -> WSResult> { if !src_dir.is_dir() { return Err(WsIoErr::Zip2(ZipError::FileNotFound).into()); } @@ -118,7 +117,7 @@ pub fn zip_dir_2_mem( pub async fn zip_dir_2_file( src_dir: impl AsRef, method: zip::CompressionMethod, - mut dst_file: std::fs::File, + dst_file: &mut std::fs::File, ) -> WSResult<()> { // // if !src_dir.is_dir() { //泛型参数不会自动解引用 曾俊 // if !src_dir.as_ref().is_dir() { @@ -134,18 +133,23 @@ pub async fn zip_dir_2_file( let it = walkdir.into_iter(); // 使用阻塞线程执行 zip 操作,因为 zip 库不支持异步 IO + let dst_file_ptr = unsafe { SendNonNull(non_null(dst_file)) }; tokio::task::spawn_blocking(move || { zip_dir( &mut it.filter_map(|e| e.ok()), - // src_dir, //泛型参数不会自动解引用 曾俊 + // src_dir, //泛型参数不会自动解引用 曾俊 src_dir.as_ref(), - &mut dst_file, + unsafe { dst_file_ptr.as_mut() }, method, ) - }).await.map_err(|e| WsIoErr::Io(std::io::Error::new( - std::io::ErrorKind::Other, - format!("Failed to execute zip task: {}", e) - )))??; + }) + .await + .map_err(|e| { + WsIoErr::Io(std::io::Error::new( + std::io::ErrorKind::Other, + format!("Failed to execute zip task: {}", e), + )) + })??; Ok(()) } @@ -169,15 +173,16 @@ mod tests { // 创建临时输出文件 let mut output_file = NamedTempFile::new()?; - + // 执行压缩 tokio::runtime::Runtime::new()?.block_on(async { zip_dir_2_file( src_path, zip::CompressionMethod::Stored, - output_file, + output_file.as_file_mut(), // output_file.as_file_mut(), - ).await + ) + .await })?; // 读取压缩后的数据 @@ -185,7 +190,7 @@ mod tests { // 创建临时解压目录 let extract_dir = tempdir()?; - + // 执行解压 unzip_data_2_path(extract_dir.path(), zip_data)?; @@ -221,14 +226,15 @@ mod tests { // 创建临时输出文件 let mut output_file = NamedTempFile::new()?; - + // 执行压缩 tokio::runtime::Runtime::new()?.block_on(async { zip_dir_2_file( src_path, zip::CompressionMethod::Stored, output_file.as_file_mut(), - ).await + ) + .await })?; // 读取压缩后的数据 @@ -236,7 +242,7 @@ mod tests { // 创建临时解压目录 let extract_dir = tempdir()?; - + // 执行解压 unzip_data_2_path(extract_dir.path(), zip_data)?; @@ -253,17 +259,18 @@ mod tests { fn test_zip_empty_directory() -> WSResult<()> { // 创建空临时目录 let src_dir = tempdir()?; - + // 创建临时输出文件 let mut output_file = NamedTempFile::new()?; - + // 执行压缩 tokio::runtime::Runtime::new()?.block_on(async { zip_dir_2_file( src_dir.path(), zip::CompressionMethod::Stored, output_file.as_file_mut(), - ).await + ) + .await })?; // 读取压缩后的数据 @@ -271,7 +278,7 @@ mod tests { // 创建临时解压目录 let extract_dir = tempdir()?; - + // 执行解压 unzip_data_2_path(extract_dir.path(), zip_data)?; diff --git a/src/main/src/worker/m_kv_user_client.rs b/src/main/src/worker/m_kv_user_client.rs index 8602458..fac8f0a 100644 --- a/src/main/src/worker/m_kv_user_client.rs +++ b/src/main/src/worker/m_kv_user_client.rs @@ -3,9 +3,8 @@ use 
crate::{ general::{ data::{ m_data_general::{ - new_data_unique_id_fn_kv, DataGeneral, DataItemIdx, DataSetMetaV2, GetOrDelDataArg, - GetOrDelDataArgType, - dataitem::DataItemArgWrapper + dataitem::DataItemArgWrapper, new_data_unique_id_fn_kv, DataGeneral, DataItemIdx, + DataSetMetaV2, GetOrDelDataArg, GetOrDelDataArgType, }, m_dist_lock::DistLock, }, @@ -201,37 +200,42 @@ impl KvUserClient { async fn handle_kv_set( &self, - app_name: &str, - func_name: &str, - set: proto::kv::kv_request::KvPutRequest, + _app_name: &str, + _func_name: &str, + _set: proto::kv::kv_request::KvPutRequest, ) -> KvResponse { - let proto::kv::KvPair { key, value } = set.kv.unwrap(); - let cur_node = self.view.p2p().nodes_config.this_node(); - tracing::debug!("handle_kv_set: key: {:?}", key); + // let proto::kv::KvPair { key, value } = set.kv.unwrap(); + // let cur_node = self.view.p2p().nodes_config.this_node(); + // tracing::debug!("handle_kv_set: key: {:?}", key); - let data_general = self.view.data_general(); + // let data_general = self.view.data_general(); //返回结果未处理 曾俊 - if let Err(e) = data_general - .write_data( - new_data_unique_id_fn_kv(&key), - //原代码: - // vec![proto::DataItem { - // data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(value)), - // }], - //修改后封装成要求的DataItemArgWrapper类型 tmpzipfile设置为Uninitialized状态 在DataItemArgWrapper结构体中添加了一个new方法 曾俊 - vec![DataItemArgWrapper::new(value)], - Some(( - cur_node, - proto::DataOpeType::Write, - proto::data_schedule_context::OpeRole::FuncCall(proto::DataOpeRoleFuncCall { - app_func: format!("{}/{}", app_name, func_name), - node_id: cur_node, - }), - )), - ) - .await{ - tracing::error!("Failed to write data: {}", e); - } + { + todo!() + // if let Err(e) = data_general + // .write_data( + // new_data_unique_id_fn_kv(&key), + // //原代码: + // // vec![proto::DataItem { + // // data_item_dispatch: Some(proto::data_item::DataItemDispatch::RawBytes(value)), + // // }], + // //修改后封装成要求的DataItemArgWrapper类型 tmpzipfile设置为Uninitialized状态 在DataItemArgWrapper结构体中添加了一个new方法 曾俊 + // vec![DataItemArgWrapper::new(value)], + // Some(( + // cur_node, + // proto::DataOpeType::Write, + // proto::data_schedule_context::OpeRole::FuncCall(proto::DataOpeRoleFuncCall { + // app_func: format!("{}/{}", app_name, func_name), + // node_id: cur_node, + // }), + // )), + // ) + // .await + // { + // tracing::error!("Failed to write data: {}", e); + // } + } + // .todo_handle("This part of the code needs to be implemented."); KvResponse::new_common(vec![]) } @@ -242,7 +246,11 @@ impl KvUserClient { _meta: DataSetMetaV2, splits: HashMap, ) -> WSResult> { - tracing::debug!("convert_get_data_res_to_kv_response uid: {:?}, split keys: {:?}", uid, splits.keys().collect::>()); + tracing::debug!( + "convert_get_data_res_to_kv_response uid: {:?}, split keys: {:?}", + uid, + splits.keys().collect::>() + ); if splits.len() != 1 { return Err(WSError::WsDataError( WsDataError::KvGotWrongSplitCountAndIdx { @@ -291,7 +299,7 @@ impl KvUserClient { let data_general = self.view.data_general(); let uid = new_data_unique_id_fn_kv(&get.range.as_ref().unwrap().start); let got = data_general - .get_or_del_data(GetOrDelDataArg { + .get_or_del_datas(GetOrDelDataArg { meta: None, unique_id: uid.clone(), ty: GetOrDelDataArgType::All, @@ -328,7 +336,7 @@ impl KvUserClient { let data_general = self.view.data_general(); let uid = new_data_unique_id_fn_kv(&delete.range.as_ref().unwrap().start); let deleted = data_general - .get_or_del_data(GetOrDelDataArg { + .get_or_del_datas(GetOrDelDataArg { meta: 
None,
                 unique_id: uid.clone(),
                 ty: GetOrDelDataArgType::Delete,
@@ -428,8 +436,8 @@ impl KvUserClient {
 
 #[cfg(test)]
 mod test {
-
-    use std::{time::Duration};
+
+    use std::time::Duration;
 
     use super::KvUserClientView;
     use crate::general::{
diff --git a/telego/bin_waverless/deployment.yml b/telego/bin_waverless/deployment.yml
deleted file mode 100644
index e8d3e41..0000000
--- a/telego/bin_waverless/deployment.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-comment: very handy disk usage analysis tool
-
-prepare:
-  # x86
-  - url: https://dev.yorhel.nl/download/ncdu-2.5-linux-x86_64.tar.gz
-    as: ncdu-2.5-linux-x86_64.tar.gz
-    trans:
-      - extract
-      - copy:
-          - ncdu: teledeploy/ncdu_amd64
-  # arm
-  - url: https://dev.yorhel.nl/download/ncdu-2.5-linux-aarch64.tar.gz
-    as: ncdu-2.5-linux-aarch64.tar.gz
-    trans:
-      - extract
-      - copy:
-          - ncdu: teledeploy/ncdu_arm64
-
-bin:
-  waverless:
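// --- Editor's note: the persist encoding reworked in proto_ext.rs above (trailing
// [file_name][name_len][is_dir][type_id] metadata) is easy to get subtly wrong.
// A hedged round-trip sketch, assuming the crate-internal `proto` types and the
// `DataItemExt` trait from this diff (test name and placement are hypothetical):
#[cfg(test)]
mod persist_roundtrip_sketch {
    use super::*;

    #[test]
    fn file_item_round_trips_through_persist_encoding() {
        let item = proto::DataItem {
            data_item_dispatch: Some(proto::data_item::DataItemDispatch::File(proto::FileData {
                file_name_opt: "demo.txt".to_string(),
                is_dir_opt: false,
                file_content: b"hello".to_vec(),
            })),
        };
        // expected layout: b"hello" ++ b"demo.txt" ++ [8 (name len), 0 (not dir), 0 (File id)]
        let encoded = item.encode_persist();
        assert_eq!(encoded.len(), 5 + 8 + 3);
        let decoded = proto::DataItem::decode_persist(encoded).unwrap();
        assert_eq!(decoded.into_data_bytes(), b"hello".to_vec());
    }
}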