Skip to content

Commit

Permalink
Merge pull request #18489 from singh264/openj9_issues_15806
Browse files Browse the repository at this point in the history
Configure the wait time for checkpoint safety
  • Loading branch information
tajila committed Dec 7, 2023
2 parents 8759109 + daf7103 commit b80ae24
Show file tree
Hide file tree
Showing 10 changed files with 88 additions and 50 deletions.
1 change: 1 addition & 0 deletions runtime/oti/j9nonbuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -4229,6 +4229,7 @@ typedef struct J9CRIUCheckpointState {
I_64 checkpointRestoreTimeDelta;
I_64 lastRestoreTimeMillis;
UDATA maxRetryForNotCheckpointSafe;
UDATA sleepMillisecondsForNotCheckpointSafe;
jclass criuJVMCheckpointExceptionClass;
jclass criuSystemCheckpointExceptionClass;
jclass criuJVMRestoreExceptionClass;
Expand Down
2 changes: 2 additions & 0 deletions runtime/oti/jvminit.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ enum INIT_STAGE {
#define VMOPT_XSHARECLASSES_DISABLEONRESTORE "-Xshareclasses:disableOnRestore"
#define VMOPT_XXENABLETHROWONDELAYECHECKPOINTOPERATION "-XX:+ThrowOnDelayedCheckpointOperation"
#define VMOPT_XXDISABLETHROWONDELAYECHECKPOINTOPERATION "-XX:-ThrowOnDelayedCheckpointOperation"
#define VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS "-XX:maxRetryForNotCheckpointSafe="
#define VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS "-XX:sleepMillisecondsForNotCheckpointSafe="
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

/* Compatibility options. */
Expand Down
3 changes: 2 additions & 1 deletion runtime/vm/CRIUHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,7 @@ criuCheckpointJVMImpl(JNIEnv *env,
UDATA success = 0;
bool safePoint = J9_ARE_ANY_BITS_SET(vm->extendedRuntimeFlags, J9_EXTENDED_RUNTIME_OSR_SAFE_POINT);
UDATA maxRetries = vm->checkpointState.maxRetryForNotCheckpointSafe;
UDATA sleepMilliseconds = vm->checkpointState.sleepMillisecondsForNotCheckpointSafe;
BOOLEAN syslogFlagNone = TRUE;
char *syslogOptions = NULL;
I_32 syslogBufferSize = 0;
Expand Down Expand Up @@ -1652,7 +1653,7 @@ criuCheckpointJVMImpl(JNIEnv *env,
for (UDATA i = 0; (0 != notSafeToCheckpoint) && (i <= maxRetries); i++) {
releaseSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint);
vmFuncs->internalExitVMToJNI(currentThread);
omrthread_nanosleep(10000);
omrthread_sleep(sleepMilliseconds);
vmFuncs->internalEnterVMFromJNI(currentThread);
acquireSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint);
notSafeToCheckpoint = checkIfSafeToCheckpoint(currentThread);
Expand Down
33 changes: 31 additions & 2 deletions runtime/vm/jvminit.c
Original file line number Diff line number Diff line change
Expand Up @@ -2760,6 +2760,37 @@ VMInitStages(J9JavaVM *vm, IDATA stage, void* reserved)
argIndex2 = FIND_NEXT_ARG_IN_VMARGS_FORWARD(STARTSWITH_MATCH, VMOPT_XXGLOBALLOCKRESERVATIONCOLON, NULL, argIndex2);
}

#if defined(J9VM_OPT_CRIU_SUPPORT)
vm->checkpointState.maxRetryForNotCheckpointSafe = 100;
if ((argIndex = FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS, NULL)) >= 0) {
UDATA maxRetryForNotCheckpointSafe = 0;
char *optname = VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS;
parseError = GET_INTEGER_VALUE(argIndex, optname, maxRetryForNotCheckpointSafe);
if (OPTION_OK != parseError) {
parseErrorOption = VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS;
goto _memParseError;
}
vm->checkpointState.maxRetryForNotCheckpointSafe = maxRetryForNotCheckpointSafe;
}

vm->checkpointState.sleepMillisecondsForNotCheckpointSafe = 10;
if ((argIndex = FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS, NULL)) >= 0) {
UDATA sleepMillisecondsForNotCheckpointSafe = 0;
char *optname = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
parseError = GET_INTEGER_VALUE(argIndex, optname, sleepMillisecondsForNotCheckpointSafe);
if (OPTION_OK != parseError) {
parseErrorOption = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
goto _memParseError;
}
if (sleepMillisecondsForNotCheckpointSafe < 1) {
parseErrorOption = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
parseError = OPTION_OUTOFRANGE;
goto _memParseError;
}
vm->checkpointState.sleepMillisecondsForNotCheckpointSafe = sleepMillisecondsForNotCheckpointSafe;
}
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

break;

case BYTECODE_TABLE_SET:
Expand Down Expand Up @@ -3924,8 +3955,6 @@ processVMArgsFromFirstToLast(J9JavaVM * vm)
}

vm->checkpointState.lastRestoreTimeMillis = -1;
/* Its unclear if we need an option for this, so we can keep the init here for the time being */
vm->checkpointState.maxRetryForNotCheckpointSafe = 100;
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

{
Expand Down
9 changes: 4 additions & 5 deletions test/functional/cmdLineTests/criu/criu_nonPortable.xml
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,8 @@
<output type="failure" caseSensitive="yes" regex="no">Could not dump the JVM processes, err=-70</output>
</test>

<!--
<test id="Create and Restore Criu Checkpoint Image once - MethodTypeDeadlockTest">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xtrace:print=j9vm.731 -fix-add-opens java.base/jdk.internal.misc=ALL-UNNAMED -fix-add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ MethodTypeDeadlockTest 1</command>
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -XX:sleepMillisecondsForNotCheckpointSafe=20 -Xtrace:print=j9vm.731 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ MethodTypeDeadlockTest 1</command>
<output type="success" caseSensitive="yes" regex="no">User requested Java dump using</output>
<output type="success" caseSensitive="yes" regex="no">TEST PASSED</output>
<output type="failure" caseSensitive="yes" regex="no">TEST FAILED</output>
Expand All @@ -303,15 +302,15 @@
<output type="required" caseSensitive="no" regex="no">Killed</output>
<output type="failure" caseSensitive="yes" regex="no">CRIU is not enabled</output>
<output type="failure" caseSensitive="yes" regex="no">Operation not permitted</output>
If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure.
<!-- If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. -->
<output type="success" caseSensitive="yes" regex="no">Thread pid mismatch</output>
<output type="success" caseSensitive="yes" regex="no">do not match expected</output>
<output type="success" caseSensitive="yes" regex="no">Unable to create a thread:</output>
<output type="failure" caseSensitive="yes" regex="no">TEST FAILED</output>
In the past, the failure below was caused by an issue where CRIU can't be found on the PATH.
<!-- In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. -->
<output type="failure" caseSensitive="yes" regex="no">Could not dump the JVM processes, err=-70</output>
</test>
-->

<test id="Create and Restore Criu Checkpoint Image once - clinitTest">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xdump:system:events=user -Xtrace:print=j9vm.731 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ ClinitTest 1</command>
<output type="success" caseSensitive="yes" regex="no">User requested Java dump using</output>
Expand Down
4 changes: 4 additions & 0 deletions test/functional/cmdLineTests/criu/playlist.xml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@
<comment>https://github.com/eclipse-openj9/openj9/issues/18468</comment>
<platform>ppc64le.*</platform>
</disable>
<disable>
<comment>https://github.com/eclipse-openj9/openj9/issues/18570</comment>
<platform>s390x.*</platform>
</disable>
</disables>
<command>
TR_Options=$(Q)exclude={org/openj9/criu/TimeChangeTest.nanoTimeInt()J},dontInline={org/openj9/criu/TimeChangeTest.nanoTimeInt()J|org/openj9/criu/TimeChangeTest.nanoTimeJit()J},{org/openj9/criu/TimeChangeTest.nanoTimeJit()J}(count=1)$(Q) \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public static void checkpointDeadlock() {

Thread t1 = new Thread(() -> {
synchronized (lock) {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
try {
Thread.sleep(20000);
} catch (InterruptedException e) {
Expand All @@ -95,7 +95,7 @@ public static void checkpointDeadlock() {
}
});

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -143,7 +143,7 @@ public static void notCheckpointSafeDeadlock() {
Thread t1 = new Thread(() -> {
Runnable run = () -> {
synchronized (lock) {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
try {
Thread.sleep(20000);
} catch (InterruptedException e) {
Expand All @@ -160,7 +160,7 @@ public static void notCheckpointSafeDeadlock() {

CRIUSupport criuSupport = new CRIUSupport(path);

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand All @@ -187,7 +187,7 @@ public static void methodTypeDeadlockTest() {
Path path = Paths.get("cpData");
final TestResult testResult = new TestResult(true, 0);
Runnable run = () -> {
testResult.lockStatus++;
testResult.lockStatus.incrementAndGet();
for (int i = 0; i < 30; i++) {
URL[] urlArray = { A.class.getProtectionDomain().getCodeSource().getLocation() };
URLClassLoader loader = new URLClassLoader(urlArray);
Expand All @@ -203,7 +203,7 @@ public static void methodTypeDeadlockTest() {
thread.start();
}

while (testResult.lockStatus < 5) {
while (testResult.lockStatus.get() < 5) {
Thread.yield();
}

Expand Down Expand Up @@ -258,15 +258,15 @@ public static void clinitTest() {
Path path = Paths.get("cpData");

mainTestResult.testPassed = false;
mainTestResult.lockStatus = 0;
mainTestResult.lockStatus.set(0);

Thread t1 = new Thread(()->{
new ClinitDeadlock();
});

t1.start();

while (mainTestResult.lockStatus == 0) {
while (mainTestResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -295,15 +295,15 @@ public static void clinitTest2() {
Path path = Paths.get("cpData");

mainTestResult.testPassed = false;
mainTestResult.lockStatus = 0;
mainTestResult.lockStatus.set(0);

Thread t1 = new Thread(()->{
new ClinitDeadlock();
});

t1.start();

while (mainTestResult.lockStatus == 0) {
while (mainTestResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -331,7 +331,7 @@ public static void clinitTest2() {
static class ClinitDeadlock {

static {
mainTestResult.lockStatus = 1;
mainTestResult.lockStatus.set(1);
synchronized(lock) {
try {
System.out.println("Thread waiting");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ private void test(String testName) throws InterruptedException {
throw new RuntimeException("Unrecognized test name: " + testName);
}

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.currentThread().yield();
}
CRIUTestUtils.checkPointJVMNoSetup(criu, CRIUTestUtils.imagePath, false);
Expand Down Expand Up @@ -138,7 +138,7 @@ public static void showMessages(String logStr, long expectedTime, boolean isMill
private void testThreadParkHelper(String testName) {
CRIUTestUtils.showThreadCurrentTime(testName + " before park()");
final long startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
unsafe.park(false, nsTime5s);
final long endNanoTime = System.nanoTime();
CRIUTestUtils.showThreadCurrentTime(testName + " after park()");
Expand All @@ -160,7 +160,7 @@ private void testThreadSleepHelper(String testName) {
CRIUTestUtils.showThreadCurrentTime(testName + " before sleep()");
final long startNanoTime = System.nanoTime();
try {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
Thread.sleep(msTime5s);
final long endNanoTime = System.nanoTime();
CRIUTestUtils.showThreadCurrentTime(testName + " after sleep()");
Expand Down Expand Up @@ -191,7 +191,7 @@ public void run() {
final long startNanoTime;
synchronized (objWait) {
startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
objWait.wait();
}
final long endNanoTime = System.nanoTime();
Expand Down Expand Up @@ -220,7 +220,7 @@ private void testObjectWaitTimedHelper(String testName, long ms, int ns) {
final long startNanoTime;
synchronized (objWait) {
startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
objWait.wait(ms, ns);
}
final long endNanoTime = System.nanoTime();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,45 +118,45 @@ static void TestConcurrentModePreCheckpointHookPriorities() {
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in CONCURRENT_MODE");
// check if it is the initial value
if (testResult.lockStatus != 0) {
if (testResult.lockStatus.get() != 0) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in CONCURRENT_MODE
if (testResult.lockStatus != 1) {
if (testResult.lockStatus.get() != 1) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 2;
testResult.lockStatus.set(2);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_HIGH);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_HIGH in CONCURRENT_MODE
if (testResult.lockStatus != 2) {
if (testResult.lockStatus.get() != 2) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 3;
testResult.lockStatus.set(3);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in SINGLE_THREAD_MODE
if (testResult.lockStatus != 3) {
if (testResult.lockStatus.get() != 3) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 4;
testResult.lockStatus.set(4);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_HIGH);

Expand Down Expand Up @@ -217,45 +217,45 @@ static void TestConcurrentModePostRestoreHookPriorities() {
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in CONCURRENT_MODE
if (testResult.lockStatus != 3) {
if (testResult.lockStatus.get() != 3) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 4;
testResult.lockStatus.set(4);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in SINGLE_THREAD_MODE
if (testResult.lockStatus != 2) {
if (testResult.lockStatus.get() != 2) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 3;
testResult.lockStatus.set(3);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_HIGH);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_HIGH in SINGLE_THREAD_MODE
if (testResult.lockStatus != 1) {
if (testResult.lockStatus.get() != 1) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 2;
testResult.lockStatus.set(2);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in SINGLE_THREAD_MODE");
// check if it is the initial value
if (testResult.lockStatus != 0) {
if (testResult.lockStatus.get() != 0) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_HIGH);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
*******************************************************************************/
package org.openj9.criu;

import java.util.concurrent.atomic.AtomicInteger;

public class TestResult {
boolean testPassed;
volatile int lockStatus;
AtomicInteger lockStatus;

TestResult(boolean testPassed, int lockStatus) {
this.testPassed = testPassed;
this.lockStatus = lockStatus;
this.lockStatus = new AtomicInteger(lockStatus);
}
}

0 comments on commit b80ae24

Please sign in to comment.